@Override @SuppressWarnings("unchecked") public String[] getLocations() throws IOException, InterruptedException { if (locations == null) { HashMap<String, Long> locMap = new HashMap<String, Long>(); Long lenInMap; for (InputSplit split : wrappedSplits) { String[] locs = split.getLocations(); for (String loc : locs) { if ((lenInMap = locMap.get(loc)) == null) locMap.put(loc, split.getLength()); else locMap.put(loc, lenInMap + split.getLength()); } } Set<Map.Entry<String, Long>> entrySet = locMap.entrySet(); Map.Entry<String, Long>[] hostSize = entrySet.toArray(new Map.Entry[entrySet.size()]); Arrays.sort( hostSize, new Comparator<Map.Entry<String, Long>>() { @Override public int compare(Entry<String, Long> o1, Entry<String, Long> o2) { long diff = o1.getValue() - o2.getValue(); if (diff < 0) return 1; if (diff > 0) return -1; return 0; } }); // maximum 5 locations are in list: refer to PIG-1648 for more details int nHost = Math.min(hostSize.length, 5); locations = new String[nHost]; for (int i = 0; i < nHost; ++i) { locations[i] = hostSize[i].getKey(); } } return locations; }
public void initialize(InputSplit split, TaskAttemptContext context) throws IOException, InterruptedException { Path path = ((FileSplit) split).getPath(); Configuration conf = context.getConfiguration(); FileSystem fs = path.getFileSystem(conf); this.in = new SequenceFile.Reader(fs, path, conf); this.end = ((FileSplit) split).getStart() + split.getLength(); if (((FileSplit) split).getStart() > in.getPosition()) { in.sync(((FileSplit) split).getStart()); // sync to start } this.start = in.getPosition(); vbytes = in.createValueBytes(); done = start >= end; info = InputInfo.getInstance(); info.setSplit((FileSplit) split); System.err.println("input split = " + split); }
/**
 * Reports the length of this split by forwarding the call to {@code delegate}.
 *
 * @return whatever {@code delegate.getLength()} returns
 * @throws IOException propagated from the delegate
 * @throws InterruptedException propagated from the delegate
 */
@Override
public long getLength() throws IOException, InterruptedException {
  final long delegateLength = delegate.getLength();
  return delegateLength;
}
@Override protected void initParser(InputSplit inSplit) throws IOException, InterruptedException { file = ((DelimitedSplit) inSplit).getPath(); configFileNameAsCollection(conf, file); // get header from the DelimitedSplit TextArrayWritable taw = ((DelimitedSplit) inSplit).getHeader(); fields = taw.toStrings(); fileIn = fs.open(file); lineSeparator = retrieveLineSeparator(fileIn); if (start != 0) { // in case the cut point is \n, back off 1 char to create a partial // line so that 1st line can be skipped start--; } fileIn.seek(start); instream = new InputStreamReader(fileIn, encoding); bytesRead = 0; fileLen = inSplit.getLength(); if (uriName == null) { generateId = conf.getBoolean(CONF_DELIMITED_GENERATE_URI, false); if (generateId) { idGen = new IdGenerator(file.toUri().getPath() + "-" + ((FileSplit) inSplit).getStart()); } else { uriId = 0; } } boolean found = generateId || uriId == 0; for (int i = 0; i < fields.length && !found; i++) { if (fields[i].equals(uriName)) { uriId = i; found = true; break; } } if (found == false) { // idname doesn't match any columns if (LOG.isDebugEnabled()) { LOG.debug("Header: " + convertToLine(fields)); } throw new IOException( "Delimited_uri_id " + uriName + " is not found in " + this.file.toUri().getPath()); } // keep leading and trailing whitespaces to ensure accuracy of pos // do not skip empty line just in case the split boundary is \n parser = new CSVParser( instream, new CSVStrategy( delimiter, encapsulator, CSVStrategy.COMMENTS_DISABLED, CSVStrategy.ESCAPE_DISABLED, false, false, false, false)); // skip first line: // 1st split, skip header; other splits, skip partial line String[] values = parser.getLine(); start += getBytesCountFromLine(values); pos = start; }