Example #1
0
  /**
   * Returns the hosts that store the most bytes of the wrapped splits.
   *
   * <p>For every wrapped split, its length is credited to each location the split reports. The
   * hosts are then ordered by their accumulated byte count, largest first, and at most five of
   * them are returned. The result is computed on the first call and cached in {@code locations};
   * later calls return the cached array.
   *
   * @return the cached array of host names, ordered from most to least accumulated data
   * @throws IOException if a wrapped split fails to report its locations or length
   * @throws InterruptedException if a wrapped split is interrupted while reporting them
   */
  @Override
  public String[] getLocations() throws IOException, InterruptedException {
    if (locations == null) {
      // Total number of bytes held by each host across all wrapped splits.
      HashMap<String, Long> locMap = new HashMap<String, Long>();
      for (InputSplit split : wrappedSplits) {
        String[] locs = split.getLocations();
        // The length does not depend on the location, so read it once per split.
        long splitLength = split.getLength();
        for (String loc : locs) {
          Long lenInMap = locMap.get(loc);
          if (lenInMap == null) {
            locMap.put(loc, splitLength);
          } else {
            locMap.put(loc, lenInMap + splitLength);
          }
        }
      }
      Set<Map.Entry<String, Long>> entrySet = locMap.entrySet();
      // Generic array creation is unavoidable here; the array only ever holds entries of locMap.
      @SuppressWarnings("unchecked")
      Map.Entry<String, Long>[] hostSize = entrySet.toArray(new Map.Entry[entrySet.size()]);
      Arrays.sort(
          hostSize,
          new Comparator<Map.Entry<String, Long>>() {

            @Override
            public int compare(Entry<String, Long> o1, Entry<String, Long> o2) {
              // Descending by byte count. compareTo avoids the overflow that a
              // subtraction-based comparison is exposed to.
              return o2.getValue().compareTo(o1.getValue());
            }
          });
      // maximum 5 locations are in list: refer to PIG-1648 for more details
      int nHost = Math.min(hostSize.length, 5);
      locations = new String[nHost];
      for (int i = 0; i < nHost; ++i) {
        locations[i] = hostSize[i].getKey();
      }
    }
    return locations;
  }
    /**
     * Opens a {@code SequenceFile.Reader} on the file behind {@code split} and positions it for
     * reading this split.
     *
     * <p>Records the split's end offset, moves the reader forward with {@code sync} when the split
     * starts beyond the reader's current position, stores the resulting position as the effective
     * start, and publishes the split through the shared {@code InputInfo} instance.
     *
     * @param split the split to read; it is cast to {@code FileSplit}
     * @param context supplies the job {@code Configuration}
     * @throws IOException if the file system or the sequence file cannot be accessed
     * @throws InterruptedException declared by the signature; not thrown directly here
     */
    public void initialize(InputSplit split, TaskAttemptContext context)
        throws IOException, InterruptedException {
      final FileSplit fileSplit = (FileSplit) split;
      final Path splitPath = fileSplit.getPath();
      final Configuration jobConf = context.getConfiguration();
      final FileSystem fileSystem = splitPath.getFileSystem(jobConf);

      this.in = new SequenceFile.Reader(fileSystem, splitPath, jobConf);
      this.end = fileSplit.getStart() + split.getLength();

      // Only reposition when the split begins after the point the reader is already at.
      final boolean startsAhead = fileSplit.getStart() > in.getPosition();
      if (startsAhead) {
        in.sync(fileSplit.getStart()); // sync to start
      }

      // The effective start is wherever the reader ended up, not the nominal split offset.
      this.start = in.getPosition();
      vbytes = in.createValueBytes();
      done = start >= end;

      info = InputInfo.getInstance();
      info.setSplit(fileSplit);

      System.err.println("input split = " + split);
    }
Example #3
0
 /**
  * Reports the length of this split by forwarding the call to {@code delegate}.
  *
  * @return whatever {@code delegate.getLength()} returns
  * @throws IOException if the delegate fails to determine its length
  * @throws InterruptedException if the delegate is interrupted
  */
 @Override
 public long getLength() throws IOException, InterruptedException {
   final long delegateLength = delegate.getLength();
   return delegateLength;
 }
  /**
   * Prepares the CSV parser for one delimited split.
   *
   * <p>Steps, in order: remember the split's file and header fields, open the file and detect its
   * line separator, seek to the (possibly backed-off) start offset, decide which column supplies
   * the document URI (or whether ids are generated), build the parser, and finally consume the
   * first line so that reading begins on a full record.
   *
   * @param inSplit the split to parse; it is cast to {@code DelimitedSplit} and, when ids are
   *     generated, to {@code FileSplit}
   * @throws IOException if the file cannot be read, or if {@code uriName} matches no header field
   * @throws InterruptedException declared by the signature
   */
  @Override
  protected void initParser(InputSplit inSplit) throws IOException, InterruptedException {
    final DelimitedSplit delimitedSplit = (DelimitedSplit) inSplit;
    file = delimitedSplit.getPath();
    configFileNameAsCollection(conf, file);

    // The header row travels with the DelimitedSplit rather than being re-read from the file.
    final TextArrayWritable header = delimitedSplit.getHeader();
    fields = header.toStrings();

    fileIn = fs.open(file);
    lineSeparator = retrieveLineSeparator(fileIn);
    if (start != 0) {
      // The cut point may be a \n: step back one char so the first line read is a
      // partial line that can be thrown away.
      start--;
    }

    fileIn.seek(start);

    instream = new InputStreamReader(fileIn, encoding);

    bytesRead = 0;
    fileLen = inSplit.getLength();
    if (uriName == null) {
      // No URI column configured: either generate ids or fall back to the first column.
      generateId = conf.getBoolean(CONF_DELIMITED_GENERATE_URI, false);
      if (generateId) {
        idGen = new IdGenerator(file.toUri().getPath() + "-" + ((FileSplit) inSplit).getStart());
      } else {
        uriId = 0;
      }
    }

    boolean found = generateId || uriId == 0;

    // Look up the configured URI column in the header, stopping at the first match.
    int column = 0;
    while (column < fields.length && !found) {
      if (fields[column].equals(uriName)) {
        uriId = column;
        found = true;
      }
      column++;
    }

    if (!found) {
      // The configured id name matches none of the header columns.
      if (LOG.isDebugEnabled()) {
        LOG.debug("Header: " + convertToLine(fields));
      }
      throw new IOException(
          "Delimited_uri_id " + uriName + " is not found in " + this.file.toUri().getPath());
    }

    // Leading/trailing whitespace is kept so byte positions stay accurate, and empty
    // lines are not skipped in case the split boundary falls on a \n.
    final CSVStrategy strategy =
        new CSVStrategy(
            delimiter,
            encapsulator,
            CSVStrategy.COMMENTS_DISABLED,
            CSVStrategy.ESCAPE_DISABLED,
            false,
            false,
            false,
            false);
    parser = new CSVParser(instream, strategy);

    // Discard the first line: the header for the first split, a partial line otherwise,
    // and advance the start offset past it.
    final String[] skippedLine = parser.getLine();
    start += getBytesCountFromLine(skippedLine);
    pos = start;
  }