public RecordReader<Text, SequencedFragment> createRecordReader(
    InputSplit genericSplit, TaskAttemptContext context)
    throws IOException, InterruptedException {
  context.setStatus(genericSplit.toString());
  return new QseqRecordReader(
      ContextUtil.getConfiguration(context),
      (FileSplit) genericSplit); // cast as per example in TextInputFormat
}
@Override
public RecordReader createRecordReader(InputSplit split, TaskAttemptContext context)
    throws IOException, InterruptedException {
  // Dispatch to the wrapped input format that produced this split type.
  if (split.getClass().equals(TableSplit.class)) {
    return tableInputFormat.createRecordReader(split, context);
  } else {
    return fileInputFormat.createRecordReader(split, context);
  }
}
/**
 * Casts an InputSplit into an HCatSplit, providing a useful error message if the cast fails.
 *
 * @param split the InputSplit
 * @return the HCatSplit
 * @throws IOException if the split is not an HCatSplit
 */
public static HCatSplit castToHCatSplit(InputSplit split) throws IOException {
  if (split instanceof HCatSplit) {
    return (HCatSplit) split;
  } else {
    throw new IOException(
        "Split must be "
            + HCatSplit.class.getName()
            + " but found "
            + split.getClass().getName());
  }
}
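// Minimal usage sketch (not from the source): how a record reader's initialize() might use
// castToHCatSplit to narrow the generic InputSplit, so a wrong split type surfaces as a
// descriptive IOException rather than a ClassCastException. The surrounding class and the
// status message are assumptions for illustration only.
public void initialize(InputSplit split, TaskAttemptContext context)
    throws IOException, InterruptedException {
  HCatSplit hcatSplit = castToHCatSplit(split);
  // hypothetical follow-up: work with the narrowed split
  context.setStatus("initializing split of length " + hcatSplit.getLength());
}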
@Override
@SuppressWarnings("unchecked")
public String[] getLocations() throws IOException, InterruptedException {
  if (locations == null) {
    // Sum the split lengths per host across all wrapped splits.
    HashMap<String, Long> locMap = new HashMap<String, Long>();
    Long lenInMap;
    for (InputSplit split : wrappedSplits) {
      String[] locs = split.getLocations();
      for (String loc : locs) {
        if ((lenInMap = locMap.get(loc)) == null) {
          locMap.put(loc, split.getLength());
        } else {
          locMap.put(loc, lenInMap + split.getLength());
        }
      }
    }
    // Sort hosts by aggregate length, largest first.
    Set<Map.Entry<String, Long>> entrySet = locMap.entrySet();
    Map.Entry<String, Long>[] hostSize =
        entrySet.toArray(new Map.Entry[entrySet.size()]);
    Arrays.sort(
        hostSize,
        new Comparator<Map.Entry<String, Long>>() {
          @Override
          public int compare(Map.Entry<String, Long> o1, Map.Entry<String, Long> o2) {
            long diff = o1.getValue() - o2.getValue();
            if (diff < 0) return 1;
            if (diff > 0) return -1;
            return 0;
          }
        });
    // At most 5 locations are kept in the list; refer to PIG-1648 for more details.
    int nHost = Math.min(hostSize.length, 5);
    locations = new String[nHost];
    for (int i = 0; i < nHost; ++i) {
      locations[i] = hostSize[i].getKey();
    }
  }
  return locations;
}
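// Illustrative, self-contained sketch (assumption, not part of the source): the same
// "sum split lengths per host, sort descending, keep at most five" selection used by
// getLocations() above, applied to a plain map so the behavior can be run in isolation.
import java.util.*;

public class TopHostsSketch {
  static String[] topHosts(Map<String, Long> bytesPerHost) {
    List<Map.Entry<String, Long>> entries = new ArrayList<>(bytesPerHost.entrySet());
    // largest aggregate length first, mirroring the comparator above
    entries.sort((a, b) -> Long.compare(b.getValue(), a.getValue()));
    int nHost = Math.min(entries.size(), 5); // cap of 5 hosts, as noted for PIG-1648
    String[] hosts = new String[nHost];
    for (int i = 0; i < nHost; i++) {
      hosts[i] = entries.get(i).getKey();
    }
    return hosts;
  }

  public static void main(String[] args) {
    Map<String, Long> m = new HashMap<>();
    m.put("hostA", 300L);
    m.put("hostB", 100L);
    m.put("hostC", 200L);
    System.out.println(Arrays.toString(topHosts(m))); // [hostA, hostC, hostB]
  }
}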
public void initialize(InputSplit split, TaskAttemptContext context)
    throws IOException, InterruptedException {
  Path path = ((FileSplit) split).getPath();
  Configuration conf = context.getConfiguration();
  FileSystem fs = path.getFileSystem(conf);
  this.in = new SequenceFile.Reader(fs, path, conf);
  this.end = ((FileSplit) split).getStart() + split.getLength();

  if (((FileSplit) split).getStart() > in.getPosition()) {
    in.sync(((FileSplit) split).getStart()); // sync to start
  }

  this.start = in.getPosition();
  vbytes = in.createValueBytes();
  done = start >= end;

  info = InputInfo.getInstance();
  info.setSplit((FileSplit) split);
  System.err.println("input split = " + split);
}
@Override
public void initialize(InputSplit split, final TaskAttemptContext context)
    throws IOException, InterruptedException {
  org.apache.hadoop.mapred.InputSplit oldSplit;
  if (split.getClass() == FileSplit.class) {
    oldSplit =
        new org.apache.hadoop.mapred.FileSplit(
            ((FileSplit) split).getPath(),
            ((FileSplit) split).getStart(),
            ((FileSplit) split).getLength(),
            split.getLocations());
  } else {
    oldSplit = ((InputSplitWrapper) split).realSplit;
  }

  @SuppressWarnings("unchecked")
  Reporter reporter =
      new Reporter() { // Reporter interface over ctx
        final TaskInputOutputContext ioCtx =
            context instanceof TaskInputOutputContext ? (TaskInputOutputContext) context : null;

        public void progress() {
          HadoopCompat.progress(context);
        }

        // @Override
        public float getProgress() {
          return (ioCtx != null) ? ioCtx.getProgress() : 0;
        }

        public void setStatus(String status) {
          if (ioCtx != null) HadoopCompat.setStatus(ioCtx, status);
        }

        public void incrCounter(String group, String counter, long amount) {
          if (ioCtx != null) {
            HadoopCompat.incrementCounter(ioCtx.getCounter(group, counter), amount);
          }
        }

        @SuppressWarnings("unchecked")
        public void incrCounter(Enum<?> key, long amount) {
          if (ioCtx != null) HadoopCompat.incrementCounter(ioCtx.getCounter(key), amount);
        }

        public org.apache.hadoop.mapred.InputSplit getInputSplit()
            throws UnsupportedOperationException {
          throw new UnsupportedOperationException();
        }

        public Counter getCounter(String group, String name) {
          return ioCtx != null ? (Counter) HadoopCompat.getCounter(ioCtx, group, name) : null;
        }

        @SuppressWarnings("unchecked")
        public Counter getCounter(Enum<?> name) {
          return ioCtx != null ? (Counter) ioCtx.getCounter(name) : null;
        }
      };

  realReader =
      realInputFormat.getRecordReader(
          oldSplit, (JobConf) HadoopCompat.getConfiguration(context), reporter);
  keyObj = realReader.createKey();
  valueObj = realReader.createValue();
}
/**
 * Returns the file name of an InputSplit. The string form of an InputSplit is, by default,
 * <tt>file + ":" + start + "+" + length</tt>.
 *
 * @param inputSplit the InputSplit
 * @return the file name
 */
public static String getFilename(InputSplit inputSplit) {
  String filename =
      org.openflamingo.mapreduce.util.FileUtils.getFilename(inputSplit.toString());
  int start = filename.indexOf(":");
  return filename.substring(0, start);
}
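// Usage sketch (assumption, not from the source): what the parsing above yields for a
// typical FileSplit, whose toString() has the form "file:start+length". The sample value
// and the inline reimplementation of FileUtils.getFilename are illustrative only.
public class GetFilenameSketch {
  public static void main(String[] args) {
    String splitString = "hdfs://namenode/data/part-00000:0+67108864"; // sample toString()
    // take the last path component first, so the "hdfs:" scheme colon is not matched
    String filename = splitString.substring(splitString.lastIndexOf('/') + 1);
    // then strip the ":start+length" suffix
    System.out.println(filename.substring(0, filename.indexOf(':'))); // part-00000
  }
}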
// new API init call
@Override
public void initialize(InputSplit split, TaskAttemptContext context) throws IOException {
  context.setStatus(split.toString());
  init((ESInputSplit) split, context.getConfiguration());
}
/** Returns the <code>RecordReader</code> for reading the arc file. */
public RecordReader<Text, ArcRecord> createRecordReader(
    InputSplit split, TaskAttemptContext context) throws IOException {
  context.setStatus(split.toString());
  return new ArcRecordReader();
}
@Override
public String[] getLocations() throws IOException, InterruptedException {
  return delegate.getLocations();
}
@Override
public long getLength() throws IOException, InterruptedException {
  return delegate.getLength();
}
@Override
protected void map(String key, String value, final Context context)
    throws IOException, InterruptedException {
  final InputSplit split = context.getInputSplit();
  if (!(split instanceof DatasourceInputSplit)) {
    throw new IAE(
        "Unexpected split type. Expected [%s] was [%s]",
        DatasourceInputSplit.class.getCanonicalName(),
        split.getClass().getCanonicalName());
  }

  final String tmpDirLoc = context.getConfiguration().get(TMP_FILE_LOC_KEY);
  final File tmpDir = Paths.get(tmpDirLoc).toFile();
  final DataSegment segment =
      Iterables.getOnlyElement(((DatasourceInputSplit) split).getSegments()).getSegment();
  final HadoopDruidConverterConfig config =
      converterConfigFromConfiguration(context.getConfiguration());

  context.setStatus("DOWNLOADING");
  context.progress();
  final Path inPath = new Path(JobHelper.getURIFromSegment(segment));
  final File inDir = new File(tmpDir, "in");
  if (inDir.exists() && !inDir.delete()) {
    log.warn("Could not delete [%s]", inDir);
  }
  // Warn only if mkdir failed and the path is still not a usable directory
  // (mirrors the outDir check below).
  if (!inDir.mkdir() && (!inDir.exists() || !inDir.isDirectory())) {
    log.warn("Unable to make directory");
  }

  final long inSize = JobHelper.unzipNoGuava(inPath, context.getConfiguration(), inDir, context);
  log.debug("Loaded %d bytes into [%s] for converting", inSize, inDir.getAbsolutePath());
  context.getCounter(COUNTER_GROUP, COUNTER_LOADED).increment(inSize);

  context.setStatus("CONVERTING");
  context.progress();
  final File outDir = new File(tmpDir, "out");
  if (!outDir.mkdir() && (!outDir.exists() || !outDir.isDirectory())) {
    throw new IOException(String.format("Could not create output directory [%s]", outDir));
  }
  HadoopDruidConverterConfig.INDEX_MERGER.convert(
      inDir, outDir, config.getIndexSpec(), JobHelper.progressIndicatorForContext(context));
  if (config.isValidate()) {
    context.setStatus("Validating");
    HadoopDruidConverterConfig.INDEX_IO.validateTwoSegments(inDir, outDir);
  }
  context.progress();

  context.setStatus("Starting PUSH");
  final Path baseOutputPath = new Path(config.getSegmentOutputPath());
  final FileSystem outputFS = baseOutputPath.getFileSystem(context.getConfiguration());
  final DataSegment finalSegmentTemplate =
      segment.withVersion(segment.getVersion() + "_converted");
  final DataSegment finalSegment =
      JobHelper.serializeOutIndex(
          finalSegmentTemplate,
          context.getConfiguration(),
          context,
          context.getTaskAttemptID(),
          outDir,
          JobHelper.makeSegmentOutputPath(baseOutputPath, outputFS, finalSegmentTemplate));
  context.progress();
  context.setStatus("Finished PUSH");

  final String finalSegmentString =
      HadoopDruidConverterConfig.jsonMapper.writeValueAsString(finalSegment);
  context
      .getConfiguration()
      .set(ConvertingOutputFormat.PUBLISHED_SEGMENT_KEY, finalSegmentString);
  context.write(new Text("dataSegment"), new Text(finalSegmentString));
  context.getCounter(COUNTER_GROUP, COUNTER_WRITTEN).increment(finalSegment.getSize());
  context.progress();
  context.setStatus("Ready To Commit");
}
@Override
protected void initParser(InputSplit inSplit) throws IOException, InterruptedException {
  file = ((DelimitedSplit) inSplit).getPath();
  configFileNameAsCollection(conf, file);

  // get header from the DelimitedSplit
  TextArrayWritable taw = ((DelimitedSplit) inSplit).getHeader();
  fields = taw.toStrings();

  fileIn = fs.open(file);
  lineSeparator = retrieveLineSeparator(fileIn);
  if (start != 0) {
    // in case the cut point is \n, back off 1 char to create a partial
    // line so that 1st line can be skipped
    start--;
  }
  fileIn.seek(start);
  instream = new InputStreamReader(fileIn, encoding);
  bytesRead = 0;
  fileLen = inSplit.getLength();

  if (uriName == null) {
    generateId = conf.getBoolean(CONF_DELIMITED_GENERATE_URI, false);
    if (generateId) {
      idGen =
          new IdGenerator(file.toUri().getPath() + "-" + ((FileSplit) inSplit).getStart());
    } else {
      uriId = 0;
    }
  }

  boolean found = generateId || uriId == 0;
  for (int i = 0; i < fields.length && !found; i++) {
    if (fields[i].equals(uriName)) {
      uriId = i;
      found = true;
      break;
    }
  }
  if (!found) { // uriName doesn't match any column
    if (LOG.isDebugEnabled()) {
      LOG.debug("Header: " + convertToLine(fields));
    }
    throw new IOException(
        "Delimited_uri_id " + uriName + " is not found in " + this.file.toUri().getPath());
  }

  // keep leading and trailing whitespaces to ensure accuracy of pos
  // do not skip empty line just in case the split boundary is \n
  parser =
      new CSVParser(
          instream,
          new CSVStrategy(
              delimiter,
              encapsulator,
              CSVStrategy.COMMENTS_DISABLED,
              CSVStrategy.ESCAPE_DISABLED,
              false,
              false,
              false,
              false));

  // skip first line:
  // 1st split, skip header; other splits, skip partial line
  String[] values = parser.getLine();
  start += getBytesCountFromLine(values);
  pos = start;
}