Example #1
  /** Check whether the file list has duplicates. */
  private static void checkDuplication(FileSystem fs, Path file, Path sorted, Configuration conf)
      throws IOException {
    SequenceFile.Reader in = null;
    try {
      SequenceFile.Sorter sorter =
          new SequenceFile.Sorter(fs, new Text.Comparator(), Text.class, Text.class, conf);
      sorter.sort(file, sorted);
      in = new SequenceFile.Reader(fs, sorted, conf);

      Text prevdst = null, curdst = new Text();
      Text prevsrc = null, cursrc = new Text();
      while (in.next(curdst, cursrc)) {
        if (prevdst != null && curdst.equals(prevdst)) {
          throw new DuplicationException(
              "Invalid input, there are duplicated files in the sources: "
                  + prevsrc
                  + ", "
                  + cursrc);
        }
        prevdst = curdst;
        curdst = new Text();
        prevsrc = cursrc;
        cursrc = new Text();
      }
    } finally {
      checkAndClose(in);
    }
  }
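A minimal sketch of the same duplicate scan written against the non-deprecated, options-based Reader constructor (as used in Examples #4 and #7), with try-with-resources in place of checkAndClose; the method name is a placeholder and the usual org.apache.hadoop imports are assumed:
  private static void checkDuplicationSketch(Path sorted, Configuration conf) throws IOException {
    // SequenceFile.Reader implements Closeable, so try-with-resources closes it for us.
    try (SequenceFile.Reader in =
        new SequenceFile.Reader(conf, SequenceFile.Reader.file(sorted))) {
      Text prevdst = null, curdst = new Text();
      Text prevsrc = null, cursrc = new Text();
      while (in.next(curdst, cursrc)) {
        if (prevdst != null && curdst.equals(prevdst)) {
          throw new IOException(
              "Invalid input, there are duplicated files in the sources: " + prevsrc + ", " + cursrc);
        }
        prevdst = curdst;
        curdst = new Text();
        prevsrc = cursrc;
        cursrc = new Text();
      }
    }
  }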
Example #2
    /*
     * Load the document centers from HDFS into memory.
     */
    protected void setup(Context context) throws IOException, InterruptedException { // read the cluster-center vector data
      Configuration conf = context.getConfiguration();
      Path cents = new Path(CENT_PATH);
      // FileSystem fs = FileSystem.get(conf);
      FileSystem fs = cents.getFileSystem(conf);

      SequenceFile.Reader reader = new SequenceFile.Reader(fs, cents, conf);
      Text key = new Text(); // key: the question ID
      Text value = new Text(); // value: the "word=TFIDF,word=TFIDF" pairs for that question ID
      Pattern p = Pattern.compile("\"([^\"]+)\"=([^,}]+)"); // regex to extract each word and its TFIDF (compiled once, outside the loop)
      while (reader.next(key, value)) {
        Map<String, Double> tfidfAndword = new HashMap<String, Double>(); // word -> TFIDF
        String[] strs = null;
        Matcher m = p.matcher(value.toString());
        while (m.find()) {
          strs = m.group().split("=");
          if (strs.length == 2) {
            tfidfAndword.put(
                strs[0].replace("\"", "").trim(),
                Double.parseDouble(strs[1].replace("}", "").trim()));
          }
        }
        centers.put(key.toString(), tfidfAndword);
      }
      reader.close();

      super.setup(context);
    }
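The regex above pulls "word"=TFIDF pairs out of a map-style string; a standalone sketch of the same extraction on a hypothetical record value, using the capture groups directly instead of re-splitting on "=" (java.util and java.util.regex imports assumed):
  public static void main(String[] args) {
    String value = "{\"hadoop\"=0.82,\"mapreduce\"=0.41}"; // hypothetical record value
    Map<String, Double> tfidfAndWord = new HashMap<String, Double>();
    Pattern p = Pattern.compile("\"([^\"]+)\"=([^,}]+)");
    Matcher m = p.matcher(value);
    while (m.find()) {
      // group(1) is the quoted word, group(2) its TFIDF value
      tfidfAndWord.put(m.group(1), Double.parseDouble(m.group(2)));
    }
    System.out.println(tfidfAndWord); // {hadoop=0.82, mapreduce=0.41} (iteration order may vary)
  }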
  /**
   * @param path HDFS path of the binary-block matrix (one or more sequence files)
   * @param job job configuration
   * @param fs file system handle
   * @param dest destination collection for the indexed matrix blocks
   * @param rlen number of rows of the overall matrix
   * @param clen number of columns of the overall matrix
   * @param brlen number of rows per block
   * @param bclen number of columns per block
   * @throws IOException if reading fails or a block lies outside the matrix range
   */
  @SuppressWarnings("deprecation")
  private void readBinaryBlockMatrixBlocksFromHDFS(
      Path path,
      JobConf job,
      FileSystem fs,
      Collection<IndexedMatrixValue> dest,
      long rlen,
      long clen,
      int brlen,
      int bclen)
      throws IOException {
    MatrixIndexes key = new MatrixIndexes();
    MatrixBlock value = new MatrixBlock();

    // set up preferred custom serialization framework for binary block format
    if (MRJobConfiguration.USE_BINARYBLOCK_SERIALIZATION)
      MRJobConfiguration.addBinaryBlockSerializationFramework(job);

    for (Path lpath : getSequenceFilePaths(fs, path)) // 1..N files
    {
      // directly read from sequence files (individual partfiles)
      SequenceFile.Reader reader = new SequenceFile.Reader(fs, lpath, job);

      try {
        while (reader.next(key, value)) {
          int row_offset = (int) (key.getRowIndex() - 1) * brlen;
          int col_offset = (int) (key.getColumnIndex() - 1) * bclen;
          int rows = value.getNumRows();
          int cols = value.getNumColumns();

          // bound check per block
          if (row_offset + rows < 0
              || row_offset + rows > rlen
              || col_offset + cols < 0
              || col_offset + cols > clen) {
            throw new IOException(
                "Matrix block ["
                    + (row_offset + 1)
                    + ":"
                    + (row_offset + rows)
                    + ","
                    + (col_offset + 1)
                    + ":"
                    + (col_offset + cols)
                    + "] "
                    + "out of overall matrix range [1:"
                    + rlen
                    + ",1:"
                    + clen
                    + "].");
          }

          // copy block to result
          dest.add(new IndexedMatrixValue(new MatrixIndexes(key), new MatrixBlock(value)));
        }
      } finally {
        IOUtilFunctions.closeSilently(reader);
      }
    }
  }
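The offset arithmetic above converts the 1-based block index stored in MatrixIndexes into a 0-based cell offset; a quick standalone check with hypothetical block dimensions:
  public static void main(String[] args) {
    int brlen = 1000, bclen = 1000; // hypothetical block dimensions
    long rowIndex = 2, colIndex = 3; // 1-based block index, as in MatrixIndexes
    int row_offset = (int) (rowIndex - 1) * brlen;
    int col_offset = (int) (colIndex - 1) * bclen;
    // block (2,3) starts at 0-based cell (1000, 2000), i.e. rows 1001.., cols 2001.. in 1-based terms
    System.out.println(row_offset + ", " + col_offset); // prints: 1000, 2000
  }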
Example #4
  /**
   * Return the sorted x*y products read from the sequence file at the given url.
   *
   * @param url path of the sequence file to read
   * @return sorted array of the products
   */
  public Double[] getR(String url) {
    List<Double> list = new ArrayList<Double>();
    Path path = new Path(url);
    Configuration conf = HUtils.getConf();
    SequenceFile.Reader reader = null;
    try {
      reader =
          new SequenceFile.Reader(
              conf, Reader.file(path), Reader.bufferSize(4096), Reader.start(0));
      DoubleArrStrWritable dkey =
          (DoubleArrStrWritable) ReflectionUtils.newInstance(reader.getKeyClass(), conf);
      DoublePairWritable dvalue =
          (DoublePairWritable) ReflectionUtils.newInstance(reader.getValueClass(), conf);

      while (reader.next(dkey, dvalue)) { // read the file record by record
        list.add(dvalue.getSum() * dvalue.getDistance()); // accumulate sum*distance (x*y); without this the returned array is always empty
      }
    } catch (Exception e) {
      e.printStackTrace();
    } finally {
      IOUtils.closeStream(reader);
    }
    Double[] dList = new Double[list.size()];
    dList = list.toArray(dList);
    Arrays.sort(dList);
    return dList;
  }
  /** determines which files have failed for a given job */
  private Set<String> getFailedFiles(Job job) throws IOException {
    Set<String> failedFiles = new HashSet<String>();

    Path outDir = SequenceFileOutputFormat.getOutputPath(job);
    FileSystem fs = outDir.getFileSystem(getConf());
    if (!fs.getFileStatus(outDir).isDir()) {
      throw new IOException(outDir.toString() + " is not a directory");
    }

    FileStatus[] files = fs.listStatus(outDir);

    for (FileStatus f : files) {
      Path fPath = f.getPath();
      if ((!f.isDir()) && (fPath.getName().startsWith(PART_PREFIX))) {
        LOG.info("opening " + fPath.toString());
        SequenceFile.Reader reader = new SequenceFile.Reader(fs, fPath, getConf());

        Text key = new Text();
        Text value = new Text();
        while (reader.next(key, value)) {
          failedFiles.add(key.toString());
        }
        reader.close();
      }
    }
    return failedFiles;
  }
  /** Debugging helper; TODO remove. */
  private void readOutputFiles(String jobName, Path outDir) throws IOException {

    FileSystem fs = outDir.getFileSystem(getConf());
    if (!fs.getFileStatus(outDir).isDir()) {
      throw new IOException(outDir.toString() + " is not a directory");
    }

    FileStatus[] files = fs.listStatus(outDir);

    for (FileStatus f : files) {
      Path fPath = f.getPath();
      if ((!f.isDir()) && (fPath.getName().startsWith(PART_PREFIX))) {
        LOG.info("opening " + fPath.toString());
        SequenceFile.Reader reader = new SequenceFile.Reader(fs, fPath, getConf());
        Text key = new Text();
        Text value = new Text();
        while (reader.next(key, value)) {
          LOG.info("read " + f.getPath().toString());
          LOG.info("read: k=" + key.toString() + " v=" + value.toString());
        }
        LOG.info("done reading " + fPath.toString());

        reader.close();
      }
    }
  }
Example #7
  private int readFile() throws IllegalArgumentException, IOException {
    int count = 0;
    final FileSystem fs = FileSystem.get(MapReduceTestUtils.getConfiguration());
    final FileStatus[] fss =
        fs.listStatus(
            new Path(
                TestUtils.TEMP_DIR
                    + File.separator
                    + MapReduceTestEnvironment.HDFS_BASE_DIRECTORY
                    + "/t1/pairs"));
    for (final FileStatus ifs : fss) {
      if (ifs.isFile() && ifs.getPath().toString().matches(".*part-r-0000[0-9]")) {
        try (SequenceFile.Reader reader =
            new SequenceFile.Reader(
                MapReduceTestUtils.getConfiguration(), Reader.file(ifs.getPath()))) {

          final Text key = new Text();
          final Text val = new Text();

          while (reader.next(key, val)) {
            count++;
            System.err.println(key + "\t" + val);
          }
        }
      }
    }
    return count;
  }
  public static void run(Configuration conf, Path input, String outputFile)
      throws IOException, InstantiationException, IllegalAccessException {
    Writer writer;
    if (outputFile == null) {
      writer = new OutputStreamWriter(System.out);
    } else {
      writer =
          new OutputStreamWriter(
              new FileOutputStream(new File(outputFile)), Charset.forName("UTF-8"));
    }

    try {
      FileSystem fs = input.getFileSystem(conf);
      for (FileStatus fst : fs.listStatus(input, new DataPathFilter())) {
        Path dataPath = fst.getPath();
        SequenceFile.Reader reader = new SequenceFile.Reader(fs, dataPath, conf);
        try {
          Text key = reader.getKeyClass().asSubclass(Text.class).newInstance();
          DocumentMapping value = new DocumentMapping();
          while (reader.next(key, value)) {
            String docId = value.getDocId();
            writer.write(docId + "\t" + key + "\n");
          }
        } finally {
          reader.close();
        }
      }
    } finally {
      writer.close();
    }
  }
Example #9
  private static void finalize(
      Configuration conf, JobConf jobconf, final Path destPath, String preservedAttributes)
      throws IOException {
    if (preservedAttributes == null) {
      return;
    }
    EnumSet<FileAttribute> preserved = FileAttribute.parse(preservedAttributes);
    if (!preserved.contains(FileAttribute.USER)
        && !preserved.contains(FileAttribute.GROUP)
        && !preserved.contains(FileAttribute.PERMISSION)) {
      return;
    }

    FileSystem dstfs = destPath.getFileSystem(conf);
    Path dstdirlist = new Path(jobconf.get(DST_DIR_LIST_LABEL));
    SequenceFile.Reader in = null;
    try {
      in = new SequenceFile.Reader(dstdirlist.getFileSystem(jobconf), dstdirlist, jobconf);
      Text dsttext = new Text();
      FilePair pair = new FilePair();
      while (in.next(dsttext, pair)) {
        Path absdst = new Path(destPath, pair.output);
        updatePermissions(pair.input, dstfs.getFileStatus(absdst), preserved, dstfs);
      }
    } finally {
      checkAndClose(in);
    }
  }
  /**
   * The result file contains a hierarchy of workerID-resultvar (incl. filename). We deduplicate on
   * the workerID. Without JVM reuse each task refers to a unique workerID, so we will not find any
   * duplicates. With JVM reuse, however, each slot refers to a workerID, and there are duplicate
   * filenames due to partial aggregation and overwrite of fname (the RemoteParWorkerMapper ensures
   * uniqueness of those files independent of the runtime implementation).
   *
   * @param job job configuration
   * @param fname result file name
   * @return array of deduplicated local variable maps, one per worker
   * @throws DMLRuntimeException
   * @throws IOException
   */
  @SuppressWarnings("deprecation")
  public static LocalVariableMap[] readResultFile(JobConf job, String fname)
      throws DMLRuntimeException, IOException {
    HashMap<Long, LocalVariableMap> tmp = new HashMap<Long, LocalVariableMap>();

    FileSystem fs = FileSystem.get(job);
    Path path = new Path(fname);
    LongWritable key = new LongWritable(); // workerID
    Text value = new Text(); // serialized var header (incl filename)

    int countAll = 0;
    for (Path lpath : MatrixReader.getSequenceFilePaths(fs, path)) {
      SequenceFile.Reader reader = new SequenceFile.Reader(fs, lpath, job);
      try {
        while (reader.next(key, value)) {
          // System.out.println("key="+key.get()+", value="+value.toString());
          if (!tmp.containsKey(key.get())) tmp.put(key.get(), new LocalVariableMap());
          Object[] dat = ProgramConverter.parseDataObject(value.toString());
          tmp.get(key.get()).put((String) dat[0], (Data) dat[1]);
          countAll++;
        }
      } finally {
        if (reader != null) reader.close();
      }
    }

    LOG.debug("Num remote worker results (before deduplication): " + countAll);
    LOG.debug("Num remote worker results: " + tmp.size());

    // create return array
    return tmp.values().toArray(new LocalVariableMap[0]);
  }
Example #11
  public static XYSeries getXY(String url) {
    XYSeries xyseries = new XYSeries("");

    Path path = new Path(url);
    Configuration conf = HUtils.getConf();
    SequenceFile.Reader reader = null;
    try {
      reader =
          new SequenceFile.Reader(
              conf, Reader.file(path), Reader.bufferSize(4096), Reader.start(0));
      DoubleArrStrWritable dkey =
          (DoubleArrStrWritable) ReflectionUtils.newInstance(reader.getKeyClass(), conf);
      DoublePairWritable dvalue =
          (DoublePairWritable) ReflectionUtils.newInstance(reader.getValueClass(), conf);

      while (reader.next(dkey, dvalue)) { // read the file record by record
        xyseries.add(dvalue.getFirst(), dvalue.getSecond());
      }
    } catch (Exception e) {
      e.printStackTrace();
    } finally {
      IOUtils.closeStream(reader);
    }
    return xyseries;
  }
  public void configure(JobConf conf) {
    numberOfCenters = Integer.valueOf(conf.get("numberOfCenters"));
    centersDirectory = conf.get("centersReadDirectory");

    try {
      Configuration c = new Configuration();
      FileSystem fs = FileSystem.get(c);

      for (int index = 0; index < numberOfCenters; ++index) {
        SequenceFile.Reader reader =
            new SequenceFile.Reader(fs, new Path(centersDirectory + "/centers/" + index), c);

        LongWritable key = new LongWritable();
        Point value = new Point();

        reader.next(key, value);

        Point center = (Point) value;

        centers.add(center);

        reader.close();
      }
    } catch (IOException e) {
      // do nothing
      // I hope this doesn't happen
      System.out.println("well, damn.");
      e.printStackTrace();
    }
  }
Example #13
  public static void main(String[] args) throws Exception {

    String inputDir = "reuters";
    int k = 25;

    Configuration conf = new Configuration();
    FileSystem fs = FileSystem.get(conf);
    String vectorsFolder = inputDir + "/tfidf-vectors";
    SequenceFile.Reader reader =
        new SequenceFile.Reader(fs, new Path(vectorsFolder + "/part-r-00000"), conf);
    List<Vector> points = new ArrayList<Vector>();
    Text key = new Text();
    VectorWritable value = new VectorWritable();

    while (reader.next(key, value)) {
      points.add(value.get());
    }
    System.out.println(points.size());
    reader.close();
    List<Vector> randomPoints = RandomPointsUtil.chooseRandomPoints(points, k);
    List<Cluster> clusters = new ArrayList<Cluster>();
    System.out.println(randomPoints.size());
    int clusterId = 0;
    for (Vector v : randomPoints) {
      clusters.add(new Cluster(v, clusterId++, new CosineDistanceMeasure()));
    }

    List<List<Cluster>> finalClusters =
        KMeansClusterer.clusterPoints(points, clusters, new CosineDistanceMeasure(), 10, 0.01);
    for (Cluster cluster : finalClusters.get(finalClusters.size() - 1)) {
      System.out.println(
          "Cluster id: " + cluster.getId() + " center: " + cluster.getCenter().asFormatString());
    }
  }
  public static Canopies readCanopyCenters(Configuration conf) throws IOException {
    Canopies canopies = new Canopies();

    FileSystem fs = FileSystem.get(conf);

    Path canopyFileName = new Path(Nasdaq.CANOPY_SEQ_FILE_PATH);
    // init canopies
    @SuppressWarnings("deprecation")
    SequenceFile.Reader reader = new SequenceFile.Reader(fs, canopyFileName, conf);

    StockVector canopy = new StockVector();
    IntWritable value = new IntWritable();

    while (reader.next(canopy, value)) {
      // parse the canopy center
      StockVector canopyToAdd = new StockVector(canopy);
      // add to canopy centers
      canopies.addCanopy(canopyToAdd);
    }

    reader.close();
    // fs.close();

    return canopies;
  }
 public void performSequenceFileRead(FileSystem fs, int count, Path file) throws IOException {
   SequenceFile.Reader reader = new SequenceFile.Reader(fs, file, conf);
   try {
     ByteWritable key = new ByteWritable();
     BytesRefArrayWritable val = new BytesRefArrayWritable();
     for (int i = 0; i < count; i++) {
       reader.next(key, val);
     }
   } finally {
     reader.close(); // release the reader even if next() throws
   }
 }
Example #16
 public SeqFileInputStream(FileSystem fs, FileStatus f) throws IOException {
   r = new SequenceFile.Reader(fs, f.getPath(), getConf());
   key =
       ReflectionUtils.newInstance(
           r.getKeyClass().asSubclass(WritableComparable.class), getConf());
   val = ReflectionUtils.newInstance(r.getValueClass().asSubclass(Writable.class), getConf());
   inbuf = new DataInputBuffer();
   outbuf = new DataOutputBuffer();
 }
  private List<InputSplit> getSplits(
      Configuration configuration, int numSplits, long totalSizeBytes) throws IOException {
    List<InputSplit> splits = new ArrayList<InputSplit>(numSplits);
    long nBytesPerSplit = (long) Math.ceil(totalSizeBytes * 1.0 / numSplits);

    CopyListingFileStatus srcFileStatus = new CopyListingFileStatus();
    Text srcRelPath = new Text();
    long currentSplitSize = 0;
    long lastSplitStart = 0;
    long lastPosition = 0;

    final Path listingFilePath = getListingFilePath(configuration);

    if (LOG.isDebugEnabled()) {
      LOG.debug(
          "Average bytes per map: "
              + nBytesPerSplit
              + ", Number of maps: "
              + numSplits
              + ", total size: "
              + totalSizeBytes);
    }
    SequenceFile.Reader reader = null;
    try {
      reader = getListingFileReader(configuration);
      while (reader.next(srcRelPath, srcFileStatus)) {
        // If adding the current file would push this split past the per-map byte
        // limit, cut the split here and start the current file in a new split.
        if (currentSplitSize + srcFileStatus.getLen() > nBytesPerSplit && lastPosition != 0) {
          FileSplit split =
              new FileSplit(listingFilePath, lastSplitStart, lastPosition - lastSplitStart, null);
          if (LOG.isDebugEnabled()) {
            LOG.debug("Creating split : " + split + ", bytes in split: " + currentSplitSize);
          }
          splits.add(split);
          lastSplitStart = lastPosition;
          currentSplitSize = 0;
        }
        currentSplitSize += srcFileStatus.getLen();
        lastPosition = reader.getPosition();
      }
      if (lastPosition > lastSplitStart) {
        FileSplit split =
            new FileSplit(listingFilePath, lastSplitStart, lastPosition - lastSplitStart, null);
        if (LOG.isDebugEnabled()) {
          LOG.debug("Creating split : " + split + ", bytes in split: " + currentSplitSize);
        }
        splits.add(split);
      }

    } finally {
      IOUtils.closeStream(reader);
    }

    return splits;
  }
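A hedged walk-through of the cutting rule above with hypothetical numbers: totalSizeBytes = 100 and numSplits = 3 give nBytesPerSplit = ceil(100/3) = 34, so listing entries of length 20, 20, 20, 40 end up in four splits:
  public static void main(String[] args) {
    long[] lengths = {20, 20, 20, 40}; // hypothetical listing entry sizes
    long nBytesPerSplit = (long) Math.ceil(100 * 1.0 / 3); // 34
    long currentSplitSize = 0;
    boolean seenAny = false; // stands in for the lastPosition != 0 check
    int splits = 0;
    for (long len : lengths) {
      if (currentSplitSize + len > nBytesPerSplit && seenAny) {
        splits++; // cut the split before this file
        currentSplitSize = 0;
      }
      currentSplitSize += len;
      seenAny = true;
    }
    if (currentSplitSize > 0) {
      splits++; // trailing split
    }
    System.out.println(splits); // prints: 4
  }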
 private static ClusterClassifier readClassifier(Configuration config, Path path, FileSystem fs)
     throws IOException {
   SequenceFile.Reader reader = new SequenceFile.Reader(fs, path, config);
   Writable key = new Text();
   ClusterClassifier classifierOut = new ClusterClassifier();
   try {
     reader.next(key, classifierOut);
   } finally {
     Closeables.closeQuietly(reader);
   }
   return classifierOut;
 }
Example #19
  private static List<IDistanceDensityMul> getIDistanceDensityMulList(String url)
      throws FileNotFoundException, IOException {
    Configuration conf = HUtils.getConf();
    SequenceFile.Reader reader = null;
    // records merged across all part files; needs sorting
    List<IDistanceDensityMul> allList = new ArrayList<IDistanceDensityMul>();
    // records from the current file
    List<IDistanceDensityMul> fileList = new ArrayList<IDistanceDensityMul>();

    FileStatus[] fss =
        HUtils.getHDFSPath(url, "true")
            .getFileSystem(conf)
            .listStatus(HUtils.getHDFSPath(url, "true"));
    for (FileStatus f : fss) {
      if (!f.toString().contains("part")) {
        continue; // skip non-part files
      }
      try {
        reader =
            new SequenceFile.Reader(
                conf, Reader.file(f.getPath()), Reader.bufferSize(4096), Reader.start(0));
        //				 <density_i*min_distancd_j> <first:density_i,second:min_distance_j,third:i>
        //				 	DoubleWritable,  IntDoublePairWritable
        CustomDoubleWritable dkey =
            (CustomDoubleWritable) ReflectionUtils.newInstance(reader.getKeyClass(), conf);
        IntDoublePairWritable dvalue =
            (IntDoublePairWritable) ReflectionUtils.newInstance(reader.getValueClass(), conf);
        int i = Utils.GETDRAWPICRECORDS_EVERYFILE;
        while (reader.next(dkey, dvalue) && i > 0) { // read at most i records from the file
          i--;
          fileList.add(
              new IDistanceDensityMul(
                  dvalue.getSecond(),
                  dvalue.getFirst(),
                  dvalue.getThird(),
                  dkey.get())); // each file is already sorted in ascending order
        }
      } catch (Exception e) {
        e.printStackTrace();
      } finally {
        IOUtils.closeStream(reader);
      }

      // merge the first Utils.GETDRAWPICRECORDS_EVERYFILE records of the current file
      if (allList.size() <= 0) { // first file: add everything
        allList.addAll(fileList);
      } else {
        combineLists(allList, fileList);
      }
    } // for
    // the first point is too large, so drop it
    return allList.subList(1, allList.size());
  }
 private Map<Text, CopyListingFileStatus> getListing(Path listingPath) throws Exception {
   SequenceFile.Reader reader =
       new SequenceFile.Reader(conf, SequenceFile.Reader.file(listingPath));
   Text key = new Text();
   CopyListingFileStatus value = new CopyListingFileStatus();
   Map<Text, CopyListingFileStatus> values = new HashMap<>();
   while (reader.next(key, value)) {
     values.put(key, value);
     key = new Text();
     value = new CopyListingFileStatus();
   }
   reader.close();
   return values;
 }
  private List<String> getKeyFromSequenceFile(FileSystem fs, Path path, Configuration conf)
      throws Exception {
    List<String> list = new ArrayList<String>();
    SequenceFile.Reader reader = new SequenceFile.Reader(conf, SequenceFile.Reader.file(path));

    String next = (String) reader.next((String) null);
    while (next != null) {
      list.add(next);
      next = (String) reader.next((String) null);
    }
    reader.close();
    return list;
  }
Example #22
 public static void main(String[] args) throws Exception {
   String mapUri = args[0];
   Configuration conf = new Configuration();
   FileSystem fs = FileSystem.get(URI.create(mapUri), conf);
   Path map = new Path(mapUri);
   Path mapData = new Path(mapUri, MapFile.DATA_FILE_NAME);
   SequenceFile.Reader reader = new SequenceFile.Reader(conf, SequenceFile.Reader.file(mapData));
   Class keyClass = reader.getKeyClass();
   Class valueClass = reader.getValueClass();
   reader.close();
   long entries = MapFile.fix(fs, map, keyClass, valueClass, false, conf);
   System.out.printf("Created MapFile %s with %d entries\n", map, entries);
 }
Example #23
 private int getMessageCount(LogFilePath logFilePath) throws Exception {
   String path = logFilePath.getLogFilePath();
   Path fsPath = new Path(path);
   FileSystem fileSystem = FileUtil.getFileSystem(path);
   SequenceFile.Reader reader = new SequenceFile.Reader(fileSystem, fsPath, new Configuration());
   LongWritable key = (LongWritable) reader.getKeyClass().newInstance();
   BytesWritable value = (BytesWritable) reader.getValueClass().newInstance();
   int result = 0;
   while (reader.next(key, value)) {
     result++;
   }
   reader.close();
   return result;
 }
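Every snippet in this collection reads sequence files; for context, a minimal sketch of writing one that a reader like getMessageCount could consume. The path, key/value classes, and record contents are hypothetical, and the usual org.apache.hadoop.conf, fs, and io imports are assumed:
  public static void writeDemoFile() throws IOException {
    Configuration conf = new Configuration();
    Path path = new Path("demo.seq"); // hypothetical output path
    SequenceFile.Writer writer = null;
    try {
      writer =
          SequenceFile.createWriter(
              conf,
              SequenceFile.Writer.file(path),
              SequenceFile.Writer.keyClass(LongWritable.class),
              SequenceFile.Writer.valueClass(BytesWritable.class));
      for (long offset = 0; offset < 10; offset++) {
        writer.append(new LongWritable(offset), new BytesWritable(("msg-" + offset).getBytes()));
      }
    } finally {
      IOUtils.closeStream(writer); // flush and close the writer
    }
  }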
Example #24
    /**
     * Produce splits such that each is no greater than the quotient of the total size and the
     * number of splits requested.
     *
     * @param job The handle to the JobConf object
     * @param numSplits Number of splits requested
     */
    public InputSplit[] getSplits(JobConf job, int numSplits) throws IOException {
      int cnfiles = job.getInt(SRC_COUNT_LABEL, -1);
      long cbsize = job.getLong(TOTAL_SIZE_LABEL, -1);
      String srcfilelist = job.get(SRC_LIST_LABEL, "");
      if (cnfiles < 0 || cbsize < 0 || "".equals(srcfilelist)) {
        throw new RuntimeException(
            "Invalid metadata: #files("
                + cnfiles
                + ") total_size("
                + cbsize
                + ") listuri("
                + srcfilelist
                + ")");
      }
      Path src = new Path(srcfilelist);
      FileSystem fs = src.getFileSystem(job);
      FileStatus srcst = fs.getFileStatus(src);

      ArrayList<FileSplit> splits = new ArrayList<FileSplit>(numSplits);
      LongWritable key = new LongWritable();
      FilePair value = new FilePair();
      final long targetsize = cbsize / numSplits;
      long pos = 0L;
      long last = 0L;
      long acc = 0L;
      long cbrem = srcst.getLen();
      SequenceFile.Reader sl = null;
      try {
        sl = new SequenceFile.Reader(fs, src, job);
        for (; sl.next(key, value); last = sl.getPosition()) {
          // If adding this file would put the split past the target size,
          // cut the split here and start this file in the next split.
          if (acc + key.get() > targetsize && acc != 0) {
            long splitsize = last - pos;
            splits.add(new FileSplit(src, pos, splitsize, (String[]) null));
            cbrem -= splitsize;
            pos = last;
            acc = 0L;
          }
          acc += key.get();
        }
      } finally {
        checkAndClose(sl);
      }
      if (cbrem != 0) {
        splits.add(new FileSplit(src, pos, cbrem, (String[]) null));
      }

      return splits.toArray(new FileSplit[splits.size()]);
    }
 protected Vector fetchVector(Path p, int keyIndex) throws IOException {
   if (!fs.exists(p)) {
     return null;
   }
   SequenceFile.Reader reader = new SequenceFile.Reader(fs, p, conf);
   try {
     IntWritable key = new IntWritable();
     VectorWritable vw = new VectorWritable();
     while (reader.next(key, vw)) {
       if (key.get() == keyIndex) {
         return vw.get();
       }
     }
     return null;
   } finally {
     reader.close(); // release the reader even on early return
   }
 }
Example #26
 private void getOffsets(LogFilePath logFilePath, Set<Long> offsets) throws Exception {
   String path = logFilePath.getLogFilePath();
   Path fsPath = new Path(path);
   FileSystem fileSystem = FileUtil.getFileSystem(path);
   SequenceFile.Reader reader = new SequenceFile.Reader(fileSystem, fsPath, new Configuration());
   LongWritable key = (LongWritable) reader.getKeyClass().newInstance();
   BytesWritable value = (BytesWritable) reader.getValueClass().newInstance();
   while (reader.next(key, value)) {
     if (!offsets.add(key.get())) {
       throw new RuntimeException(
           "duplicate key " + key.get() + " found in file " + logFilePath.getLogFilePath());
     }
   }
   reader.close();
 }
  private static Vector loadVector(Configuration conf, Path path) throws IOException {
    FileSystem fs = path.getFileSystem(conf);
    SequenceFile.Reader reader = new SequenceFile.Reader(fs, path, conf);
    Writable key = new IntWritable();
    VectorWritable value = new VectorWritable();

    try {
      if (!reader.next(key, value)) {
        throw new IOException("Input vector file is empty.");
      }
      return value.get();
    } finally {
      reader.close();
    }
  }
 /**
  * Return the progress within the input split
  *
  * @return 0.0 to 1.0 of the input byte range
  */
 public float getProgress() throws IOException, InterruptedException {
   if (end == start) {
     return 0.0f;
   } else {
     return Math.min(1.0f, (float) ((in.getPosition() - start) / (double) (end - start)));
   }
 }
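A quick numeric check of the formula above with a hypothetical byte range: start = 0, end = 1000, and a position of 250 report 0.25, and the Math.min clamp keeps the result at 1.0 if the reader sync-skips past end:
  public static void main(String[] args) {
    long start = 0, end = 1000, position = 250; // hypothetical split byte range
    float progress =
        (end == start)
            ? 0.0f
            : Math.min(1.0f, (float) ((position - start) / (double) (end - start)));
    System.out.println(progress); // prints: 0.25
  }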
  private MedoidSet readMedoidsSet(Path input, Configuration config)
      throws IOException, IllegalAccessException, InstantiationException {

    FileSystem fs = FileSystem.get(config);
    SequenceFile.Reader reader = new SequenceFile.Reader(fs, input, config);
    MedoidSet set = new MedoidSet();

    try {
      Writable key = (Writable) reader.getKeyClass().newInstance();
      reader.next(key, set);
    } finally {
      IOUtils.quietClose(reader);
    }

    log.debug("Read initial medoid set:" + set);
    return set;
  }
Example #30
    @Override
    protected void setup(Context context) throws IOException, InterruptedException {
      super.setup(context);
      Configuration conf = context.getConfiguration();
      Path centroids = new Path(conf.get(CENTERS_CONF_KEY));
      FileSystem fs = FileSystem.get(conf);

      SequenceFile.Reader reader = new SequenceFile.Reader(fs, centroids, conf);

      Centroid key = new Centroid();
      IntWritable value = new IntWritable();
      while (reader.next(key, value)) {
        Centroid clusterCenter = new Centroid(key);
        centers.add(clusterCenter);
      }
      reader.close();
    }