Example #1
  public static Vocabulary load(Path vocabFile, FileSystem fs) throws IOException {

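    // First pass: count the vocabulary entries whose index column is positive.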
    BufferedReader in = new BufferedReader(new InputStreamReader(fs.open(vocabFile)));
    int size = 0;
    String line;
    while ((line = in.readLine()) != null) {
      String[] columns = StringUtils.tokenize(line, "\t");
      int n = Integer.parseInt(columns[1]);
      if (n > 0) {
        size++;
      }
    }
    in.close();

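    // Second pass: reopen the file and place each token at its 1-based index.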
    in = new BufferedReader(new InputStreamReader(fs.open(vocabFile)));
    Vocabulary v = new Vocabulary(size);
    while ((line = in.readLine()) != null) {
      String[] columns = StringUtils.tokenize(line, "\t");
      String tok = columns[0];
      int n = Integer.parseInt(columns[1]);
      if (n > 0) {
        v.vocab[n - 1] = tok;
      }
    }
    in.close();
    return v;
  }
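A minimal caller sketch for the loader above (the path and configuration are hypothetical, and it assumes the static load method lives on the Vocabulary class, as the return type suggests):

  Configuration conf = new Configuration();
  FileSystem fs = FileSystem.get(conf);
  Vocabulary v = Vocabulary.load(new Path("/data/vocab.tsv"), fs);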
Example #2
  public static void main(String[] args) throws IOException {
    String uri = args[0];

    Configuration configuration = new Configuration();
    System.out.println("Trying to get the file system object");
    URI uriObj = URI.create(uri);
    System.out.println("Got URI object " + uri);
    FileSystem fs = FileSystem.get(uriObj, configuration);
    FSDataInputStream fsDataInputStream = null;

    Path hdfsPath = new Path(uri);

    fsDataInputStream = fs.open(hdfsPath);
    // Start reading from the 0th byte of the file.
    fsDataInputStream.seek(0);
    IOUtils.copyBytes(fsDataInputStream, System.out, 4096, false);
    // copyBytes was told not to close the stream, so close it explicitly.
    fsDataInputStream.close();
    System.out.println("*******************************************");

    BufferedReader br = new BufferedReader(new InputStreamReader(fs.open(hdfsPath)));

    try {
      String line;
      line = br.readLine();
      while (line != null) {
        System.out.println("################ Line is###### " + line);
        // be sure to read the next line otherwise you'll get an infinite loop
        line = br.readLine();
      }
    } finally {
      // you should close out the BufferedReader
      br.close();
    }
  }
Example #3
    @Override
    public void setup(Context context) throws IOException, InterruptedException {

      LOGGER.info("TopKRollupPhaseOneJob.TopKRollupPhaseOneReducer.setup()");

      Configuration configuration = context.getConfiguration();
      FileSystem fileSystem = FileSystem.get(configuration);
      Path configPath = new Path(configuration.get(TOPK_ROLLUP_PHASE1_CONFIG_PATH.toString()));
      try {
        starTreeConfig = StarTreeConfig.decode(fileSystem.open(configPath));
        config = TopKRollupPhaseOneConfig.fromStarTreeConfig(starTreeConfig);
        dimensionNames = config.getDimensionNames();
        metricTypes = config.getMetricTypes();
        metricSchema = new MetricSchema(config.getMetricNames(), metricTypes);
        metricThresholds = config.getMetricThresholds();
        keyWritable = new BytesWritable();
        valWritable = new BytesWritable();

        MetricSums metricSumsObj =
            OBJECT_MAPPER.readValue(
                fileSystem.open(
                    new Path(configuration.get(TOPK_ROLLUP_PHASE1_METRIC_SUMS_PATH.toString()))),
                MetricSums.class);
        metricSums = metricSumsObj.getMetricSum();

      } catch (Exception e) {
        throw new IOException(e);
      }
    }
Example #4
 private static void addFolder(FileSystem fs, Path p, JsonArray succeeded, JsonArray failed) {
   try {
     if (fs == null) return;
     for (FileStatus file : fs.listStatus(p)) {
       Path pfs = file.getPath();
       if (file.isDir()) {
         addFolder(fs, pfs, succeeded, failed);
       } else {
         Key k = Key.make(pfs.toString());
         long size = file.getLen();
         Value val = null;
         if (pfs.getName().endsWith(Extensions.JSON)) {
           JsonParser parser = new JsonParser();
           JsonObject json = parser.parse(new InputStreamReader(fs.open(pfs))).getAsJsonObject();
           JsonElement v = json.get(Constants.VERSION);
           if (v == null) throw new RuntimeException("Missing version");
           JsonElement type = json.get(Constants.TYPE);
           if (type == null) throw new RuntimeException("Missing type");
           Class c = Class.forName(type.getAsString());
           OldModel model = (OldModel) c.newInstance();
           model.fromJson(json);
         } else if (pfs.getName().endsWith(Extensions.HEX)) { // Hex file?
           FSDataInputStream s = fs.open(pfs);
           int sz = (int) Math.min(1L << 20, size); // Read up to the 1st meg
           byte[] mem = MemoryManager.malloc1(sz);
           s.readFully(mem);
           // Convert to a ValueArray (hope it fits in 1Meg!)
           ValueArray ary = new ValueArray(k, 0).read(new AutoBuffer(mem));
           val = new Value(k, ary, Value.HDFS);
         } else if (size >= 2 * ValueArray.CHUNK_SZ) {
           val =
               new Value(
                   k,
                   new ValueArray(k, size),
                   Value.HDFS); // ValueArray byte wrapper over a large file
         } else {
           val = new Value(k, (int) size, Value.HDFS); // Plain Value
           val.setdsk();
         }
         DKV.put(k, val);
         Log.info("PersistHdfs: DKV.put(" + k + ")");
         JsonObject o = new JsonObject();
         o.addProperty(Constants.KEY, k.toString());
         o.addProperty(Constants.FILE, pfs.toString());
         o.addProperty(Constants.VALUE_SIZE, file.getLen());
         succeeded.add(o);
       }
     }
   } catch (Exception e) {
     Log.err(e);
     JsonObject o = new JsonObject();
     o.addProperty(Constants.FILE, p.toString());
     o.addProperty(Constants.ERROR, e.getMessage());
     failed.add(o);
   }
 }
Example #5
  /** This tests {@link StringWriter} with non-rolling output. */
  @Test
  public void testNonRollingStringWriter() throws Exception {
    final int NUM_ELEMENTS = 20;
    final int PARALLELISM = 2;
    final String outPath = hdfsURI + "/string-non-rolling-out";
    StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
    env.setParallelism(PARALLELISM);

    DataStream<Tuple2<Integer, String>> source =
        env.addSource(new TestSourceFunction(NUM_ELEMENTS)).broadcast().filter(new OddEvenFilter());

    RollingSink<String> sink =
        new RollingSink<String>(outPath)
            .setBucketer(new NonRollingBucketer())
            .setPartPrefix("part")
            .setPendingPrefix("")
            .setPendingSuffix("");

    source
        .map(
            new MapFunction<Tuple2<Integer, String>, String>() {
              private static final long serialVersionUID = 1L;

              @Override
              public String map(Tuple2<Integer, String> value) throws Exception {
                return value.f1;
              }
            })
        .addSink(sink);

    env.execute("RollingSink String Write Test");

    FSDataInputStream inStream = dfs.open(new Path(outPath + "/part-0-0"));

    BufferedReader br = new BufferedReader(new InputStreamReader(inStream));

    for (int i = 0; i < NUM_ELEMENTS; i += 2) {
      String line = br.readLine();
      Assert.assertEquals("message #" + i, line);
    }

    inStream.close();

    inStream = dfs.open(new Path(outPath + "/part-1-0"));

    br = new BufferedReader(new InputStreamReader(inStream));

    for (int i = 1; i < NUM_ELEMENTS; i += 2) {
      String line = br.readLine();
      Assert.assertEquals("message #" + i, line);
    }

    inStream.close();
  }
Example #6
  /**
   * Generates the Dataset by parsing the entire data
   *
   * @param descriptor attributes description
   * @param regression if true, the label is numerical
   * @param fs file system
   * @param path data path
   */
  public static Dataset generateDataset(
      CharSequence descriptor, boolean regression, FileSystem fs, Path path)
      throws DescriptorException, IOException {
    Attribute[] attrs = DescriptorUtils.parseDescriptor(descriptor);

    FSDataInputStream input = fs.open(path);
    Scanner scanner = new Scanner(input, "UTF-8");

    // used to convert CATEGORICAL attribute to Integer
    @SuppressWarnings("unchecked")
    Set<String>[] valsets = new Set[attrs.length];

    int size = 0;
    while (scanner.hasNextLine()) {
      String line = scanner.nextLine();
      if (!line.isEmpty()) {
        if (parseString(attrs, valsets, line, regression)) {
          size++;
        }
      }
    }

    scanner.close();

    @SuppressWarnings("unchecked")
    List<String>[] values = new List[attrs.length];
    for (int i = 0; i < valsets.length; i++) {
      if (valsets[i] != null) {
        values[i] = Lists.newArrayList(valsets[i]);
      }
    }

    return new Dataset(attrs, values, size, regression);
  }
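A hedged usage sketch for generateDataset (the descriptor string, regression flag, and path are hypothetical; the exact descriptor syntax is whatever DescriptorUtils.parseDescriptor accepts):

  FileSystem fs = FileSystem.get(new Configuration());
  Dataset dataset = generateDataset("N N C L", false, fs, new Path("/data/train.csv"));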
Example #7
  /**
   * Runs a range query on the local machine by iterating over the whole file.
   *
   * @param fs - FileSystem that contains input file
   * @param file - path to the input file
   * @param queryRange - The range to look in
   * @param shape - An instance of the shape stored in file
   * @param output - Output is sent to this collector. If <code>null</code>, output is not collected
   *     and only the number of results is returned.
   * @return number of results found
   * @throws IOException
   */
  public static <S extends Shape> long rangeQueryLocal(
      FileSystem fs, Path file, Shape queryRange, S shape, ResultCollector<S> output)
      throws IOException {
    long file_size = fs.getFileStatus(file).getLen();
    ShapeRecordReader<S> shapeReader = new ShapeRecordReader<S>(fs.open(file), 0, file_size);

    long resultCount = 0;
    Prism cell = shapeReader.createKey();

    while (shapeReader.next(cell, shape)) {
      if (shape.isIntersected(queryRange)) {
        boolean report_result;
        if (cell.isValid()) {
          // Check for duplicate avoidance
          Prism intersection_mbr = queryRange.getMBR().getIntersection(shape.getMBR());
          report_result =
              cell.contains(intersection_mbr.t1, intersection_mbr.x1, intersection_mbr.y1);
        } else {
          report_result = true;
        }
        if (report_result) {
          resultCount++;
          if (output != null) {
            output.collect(shape);
          }
        }
      }
    }
    shapeReader.close();
    return resultCount;
  }
Example #8
 @Test(timeout = 120000)
 public void testSeekAfterSetDropBehind() throws Exception {
   // start a cluster
   LOG.info("testSeekAfterSetDropBehind");
   Configuration conf = new HdfsConfiguration();
   MiniDFSCluster cluster = null;
   String TEST_PATH = "/test";
   int TEST_PATH_LEN = MAX_TEST_FILE_LEN;
   try {
     cluster = new MiniDFSCluster.Builder(conf).numDataNodes(1).build();
     cluster.waitActive();
     FileSystem fs = cluster.getFileSystem();
     createHdfsFile(fs, new Path(TEST_PATH), TEST_PATH_LEN, false);
     // verify that we can seek after setDropBehind
     FSDataInputStream fis = fs.open(new Path(TEST_PATH));
     try {
       Assert.assertTrue(fis.read() != -1); // create BlockReader
       fis.setDropBehind(false); // clear BlockReader
       fis.seek(2); // seek
     } finally {
       fis.close();
     }
   } finally {
     if (cluster != null) {
       cluster.shutdown();
     }
   }
 }
Example #9
  // load centroids from HDFS
  static void loadCentroids(
      ArrTable<DoubleArray> cenTable, int vectorSize, String cFileName, Configuration configuration)
      throws IOException {
    Path cPath = new Path(cFileName);
    FileSystem fs = FileSystem.get(configuration);
    FSDataInputStream in = fs.open(cPath);
    BufferedReader br = new BufferedReader(new InputStreamReader(in));
    String line = "";
    String[] vector = null;
    int partitionId = 0;
    while ((line = br.readLine()) != null) {
      vector = line.split("\\s+");
      if (vector.length != vectorSize) {
        System.out.println("Errors while loading centroids .");
        System.exit(-1);
      } else {
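        // Allocate vectorSize + 1 doubles; the extra last slot is zeroed here and
        // presumably used later as an accumulator during the K-means iterations.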
        double[] aCen = new double[vectorSize + 1];

        for (int i = 0; i < vectorSize; i++) {
          aCen[i] = Double.parseDouble(vector[i]);
        }
        aCen[vectorSize] = 0;
        ArrPartition<DoubleArray> ap =
            new ArrPartition<DoubleArray>(partitionId, new DoubleArray(aCen, 0, vectorSize + 1));
        cenTable.addPartition(ap);
        partitionId++;
      }
    }
    br.close();
  }
Example #10
  private double[] getSparkModelInfoFromHDFS(Path location, Configuration conf) throws Exception {

    FileSystem fileSystem = FileSystem.get(location.toUri(), conf);
    FileStatus[] files = fileSystem.listStatus(location);

    if (files == null) throw new Exception("Couldn't find Spark Truck ML weights at: " + location);

    ArrayList<Double> modelInfo = new ArrayList<Double>();
    for (FileStatus file : files) {

      if (file.getPath().getName().startsWith("_")) {
        continue;
      }

      InputStream stream = fileSystem.open(file.getPath());

      StringWriter writer = new StringWriter();
      IOUtils.copy(stream, writer, "UTF-8");
      stream.close();
      String raw = writer.toString();
      for (String str : raw.split("\n")) {
        modelInfo.add(Double.valueOf(str));
      }
    }

    return Doubles.toArray(modelInfo);
  }
Example #11
  public static void readHiveResult(String path, OutputStreamWriter outStream, Configuration conf)
      throws IOException {
    FileSystem fs = FileSystem.get(conf);
    Path dir = new Path(path);
    if (!fs.exists(dir)) {
      throw new IOException("cannot find path: " + path);
    }
    FileStatus[] filelist = fs.listStatus(dir);

    long bytesRead = 0L;
    long maxsize = 1024L * 1024 * 1024 * 10;

    for (FileStatus f : filelist) {
      if (!f.isDir() && !f.getPath().getName().startsWith("_")) {
        FSDataInputStream in = fs.open(f.getPath());
        BufferedReader bf = new BufferedReader(new InputStreamReader(in));
        String line;
        while ((line = bf.readLine()) != null) {
          bytesRead += line.getBytes().length;
          outStream.write(line.replaceAll("\001", ",").replaceAll("\t", ","));
          outStream.write("\r\n");
          if (bytesRead >= maxsize) {
            bf.close();
            in.close();
            return;
          }
        }
        bf.close();
        in.close();
      }
    }
  }
Example #12
 /** {@inheritDoc} */
 @Override
 public int open(String path) throws IOException {
   long startTime = System.currentTimeMillis();
   InputStream in = fileSystem.open(new Path(path));
   in.close();
   return (int) (System.currentTimeMillis() - startTime);
 }
Example #13
 /**
  * Sets {@link #mHdfsInputStream} to a stream from the under storage system, positioned
  * at the given offset. {@link #mCurrentPosition} is not updated to the new position.
  *
  * @param position the stream offset to seek to
  * @throws IOException if opening the file fails
  */
 private void getHdfsInputStream(long position) throws IOException {
   if (mHdfsInputStream == null) {
     org.apache.hadoop.fs.FileSystem fs = mHdfsPath.getFileSystem(mHadoopConf);
     mHdfsInputStream = fs.open(mHdfsPath, mHadoopBufferSize);
   }
   mHdfsInputStream.seek(position);
 }
Example #14
 @Override
 public ModelInput<StringBuilder> createInput(
     Class<? extends StringBuilder> dataType,
     FileSystem fileSystem,
     Path path,
     long offset,
     long fragmentSize,
     Counter counter)
     throws IOException, InterruptedException {
   FileSystem fs = FileSystem.get(path.toUri(), getConf());
   FSDataInputStream in = fs.open(path);
   boolean succeed = false;
   try {
     in.seek(offset);
     ModelInput<StringBuilder> result =
         format.createInput(
             dataType, path.toString(), new CountInputStream(in, counter), offset, fragmentSize);
     succeed = true;
     return result;
   } finally {
      if (!succeed) {
       in.close();
     }
   }
 }
Example #15
  /**
   * Retrieves the centroids between K-means iterations.
   *
   * @return the centroids
   */
  public static long[] getCentroids() throws IOException {
    Configuration conf = setupConf();
    FileSystem fs = FileSystem.get(conf);
    Path path = new Path(BASE_OUTPUT + CENTROID_FILE);
    long[] centroids = new long[4];

    FSDataInputStream in = fs.open(path);
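    // The centroid file evidently interleaves the four numeric values with label
    // tokens and single-character separators; the extra readUTF/readChar calls
    // below skip over those tokens.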
    centroids[0] = Long.parseLong(in.readUTF());
    in.readChar();

    in.readUTF();
    in.readChar();
    centroids[1] = Long.parseLong(in.readUTF());
    in.readChar();
    in.readUTF();
    in.readChar();

    in.readUTF();
    in.readChar();
    centroids[2] = Long.parseLong(in.readUTF());
    in.readChar();
    in.readUTF();
    in.readChar();

    in.readUTF();
    in.readChar();
    centroids[3] = Long.parseLong(in.readUTF());

    in.close();

    return centroids;
  }
Example #16
 public static void createCentersSequenceFile(
     Configuration conf, FileSystem fs, String centroidsPath, String sequenceFilePath)
     throws Exception {
   Path seqFile = new Path(sequenceFilePath);
   if (fs.exists(seqFile)) {
     fs.delete(seqFile, true);
   }
   // FSDataInputStream.readLine() is deprecated and available() is not a reliable
   // end-of-file test, so read lines through a BufferedReader instead.
   BufferedReader reader =
       new BufferedReader(new InputStreamReader(fs.open(new Path(centroidsPath))));
   SequenceFile.Writer writer =
       SequenceFile.createWriter(fs, conf, seqFile, Centroid.class, IntWritable.class);
   IntWritable value = new IntWritable(0);
   String line;
   while ((line = reader.readLine()) != null) {
     StringTokenizer tokenizer = new StringTokenizer(line, " ");
     int dim = tokenizer.countTokens() - 1;
     int clusterId = Integer.valueOf(tokenizer.nextToken());
     double[] coords = new double[dim];
     for (int i = 0; i < dim; i++) {
       coords[i] = Double.valueOf(tokenizer.nextToken());
     }
     Centroid cluster = new Centroid(clusterId, new Point(coords));
     writer.append(cluster, value);
   }
   IOUtils.closeStream(writer);
   reader.close();
 }
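A hypothetical invocation of the helper above (all paths are placeholders):

  createCentersSequenceFile(conf, fs, "/kmeans/centroids.txt", "/kmeans/centroids.seq");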
Example #17
  private InputStream OpenMultiplePartsWithOffset(FileSystem fs, Path pt, long offset)
      throws IOException {
    RemoteIterator<LocatedFileStatus> rit = fs.listFiles(pt, false);
    Vector<FSDataInputStream> fileHandleList = new Vector<FSDataInputStream>();
    while (rit.hasNext()) {
      Path path = rit.next().getPath();
      String filename =
          path.toString().substring(path.getParent().toString().length(), path.toString().length());

      if (filename.startsWith("/part-")) {
        long filesize = fs.getFileStatus(path).getLen();
        if (offset < filesize) {
          FSDataInputStream handle = fs.open(path);
          if (offset > 0) {
            handle.seek(offset);
          }
          fileHandleList.add(handle);
        }
        offset -= filesize;
      }
    }
    if (fileHandleList.size() == 1) return fileHandleList.get(0);
    else if (fileHandleList.size() > 1) {
      Enumeration<FSDataInputStream> enu = fileHandleList.elements();
      return new SequenceInputStream(enu);
    } else {
      System.err.println("Error: no source file loaded. Run genSeedDataset.sh first!");
      return null;
    }
  }
Example #18
  public void initialize(InputSplit genericSplit, TaskAttemptContext context) throws IOException {
    Configuration conf = ContextUtil.getConfiguration(context);
    this.maxLineLength = conf.getInt("mapred.linerecordreader.maxlength", Integer.MAX_VALUE);

    FileSplit split = (FileSplit) genericSplit;
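    // BGZF virtual file offsets: the upper 48 bits address a compressed block and
    // the lower 16 bits an offset within it, hence the shifts by 16 below.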
    start = split.getStart() << 16;
    end = (split.getStart() + split.getLength()) << 16;

    final Path file = split.getPath();
    FileSystem fs = file.getFileSystem(conf);

    bin =
        new BlockCompressedInputStream(
            new WrapSeekable<FSDataInputStream>(
                fs.open(file), fs.getFileStatus(file).getLen(), file));

    in = new LineReader(bin, conf);

    if (start != 0) {
      bin.seek(start);

      // Skip first line
      in.readLine(new Text());
      start = bin.getFilePointer();
    }
    this.pos = start;
  }
Example #19
  // load data from HDFS
  static ArrayList<DoubleArray> loadData(List<String> fileNames, int vectorSize, Configuration conf)
      throws IOException {
    ArrayList<DoubleArray> data = new ArrayList<DoubleArray>();
    for (String filename : fileNames) {
      FileSystem fs = FileSystem.get(conf);
      Path dPath = new Path(filename);
      FSDataInputStream in = fs.open(dPath);
      BufferedReader br = new BufferedReader(new InputStreamReader(in));
      String line = "";
      String[] vector = null;
      while ((line = br.readLine()) != null) {
        vector = line.split("\\s+");

        if (vector.length != vectorSize) {
          System.out.println("Errors while loading data.");
          System.exit(-1);
        } else {
          double[] aDataPoint = new double[vectorSize];

          for (int i = 0; i < vectorSize; i++) {
            aDataPoint[i] = Double.parseDouble(vector[i]);
          }
          DoubleArray da = new DoubleArray(aDataPoint, 0, vectorSize);
          data.add(da);
        }
      }
      br.close();
    }
    return data;
  }
Example #20
  @Override
  public List<String> getContent(String path, int lineCount) throws IOException {
    FileStatus[] files = fileSystem.globStatus(new Path(path));
    ArrayList<String> lines = new ArrayList<String>();

    if (files != null) {
      for (FileStatus file : files) {

        if (lines.size() >= lineCount) {
          break;
        }

        if (!file.isDirectory()) {

          DataInputStream in = fileSystem.open(file.getPath());

          BufferedReader dataReader = new BufferedReader(new InputStreamReader(in));

          String line = dataReader.readLine();
          while (line != null && lines.size() < lineCount) {
            lines.add(line);
            line = dataReader.readLine();
          }

          dataReader.close();
          in.close();
        }
      }
    }
    return lines;
  }
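A hedged example call (the glob pattern and line budget are hypothetical):

  List<String> firstLines = getContent("/logs/2015/*/part-*", 100);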
Example #21
  /**
   * Check duplicated tweet IDs in <b>tweetIdDir</b>, and output the duplicates to stdout.
   *
   * @param tweetIdDir the directory containing the tweet ID files
   * @throws Exception
   */
  public static void checkTidDuplicates(String tweetIdDir) throws Exception {
    // First change path strings to URI strings starting with 'file:' or 'hdfs:'
    tweetIdDir = MultiFileFolderWriter.getUriStrForPath(tweetIdDir);

    Set<String> tidSet = new HashSet<String>();
    Configuration conf = HBaseConfiguration.create();
    FileSystem fs = FileSystem.get(new URI(tweetIdDir), conf);
    int dupCount = 0;
    for (FileStatus srcFileStatus : fs.listStatus(new Path(tweetIdDir))) {
      String srcFileName = srcFileStatus.getPath().getName();
      if (srcFileName.endsWith(".txt") && srcFileName.contains("tweetIds")) {
        BufferedReader brTid =
            new BufferedReader(new InputStreamReader(fs.open(srcFileStatus.getPath())));
        String tid = brTid.readLine();
        while (tid != null) {
          if (tidSet.contains(tid)) {
            System.out.println("Duplicated tweet ID: " + tid);
            dupCount++;
          } else {
            tidSet.add(tid);
          }
          tid = brTid.readLine();
        }
        brTid.close();
      }
    }
    System.out.println(
        "Number of unique tweet IDs: " + tidSet.size() + ", number of duplicates: " + dupCount);
  }
Example #22
  private void loadDocumentIndex(String documentIndexPath) throws IOException {
    if (documentIndex == null) {
      documentIndex = new HashMap<String, Integer>();

      Path p = new Path(documentIndexPath);
      FileSystem fs = FileSystem.get(p.toUri(), new Configuration());
      int index = 0;
      for (FileStatus status : fs.listStatus(p)) {
        Path currPath = status.getPath();
        if (!status.isDir() && !currPath.getName().startsWith("_")) {
          BufferedReader reader = null;
          try {
            reader = new BufferedReader(new InputStreamReader(fs.open(currPath)));
            String line = null;
            while ((line = reader.readLine()) != null) {
              documentIndex.put(line.trim(), index++);
            }
          } finally {
            if (reader != null) {
              reader.close();
            }
          }
        }
      }

      log.info("Loaded document index with size: " + documentIndex.size());
    }
  }
Example #23
  @Override
  public boolean nextKeyValue() throws IOException, InterruptedException {
    FileSystem fileSystem = FileSystem.get(configuration);

    if (fileSystem.isDirectory(split.getPath())) {
      return false;
    }

    if (fileProcessed) {
      return false;
    }

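    // Whole-file record reader: the split presumably spans the entire
    // (non-splittable) file, so its length is used as the record size.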
    int fileLength = (int) split.getLength();
    byte[] result = new byte[fileLength];

    FSDataInputStream inputStream = null;

    try {
      inputStream = fileSystem.open(split.getPath());
      IOUtils.readFully(inputStream, result, 0, fileLength);
      currentValue.set(result, 0, fileLength);
    } finally {
      IOUtils.closeStream(inputStream);
    }
    fileProcessed = true;
    return true;
  }
Example #24
  /**
   * Loads the data from a file
   *
   * @param fs file system
   * @param fpath data file path
   * @throws IOException if any problem is encountered
   */
  public static Data loadData(Dataset dataset, FileSystem fs, Path fpath) throws IOException {
    FSDataInputStream input = fs.open(fpath);
    Scanner scanner = new Scanner(input);

    List<Instance> instances = Lists.newArrayList();

    DataConverter converter = new DataConverter(dataset);

    while (scanner.hasNextLine()) {
      String line = scanner.nextLine();
      if (line.isEmpty()) {
        log.warn("{}: empty string", instances.size());
        continue;
      }

      Instance instance = converter.convert(instances.size(), line);
      if (instance == null) {
        // missing values found
        log.warn("{}: missing values", instances.size());
        continue;
      }

      instances.add(instance);
    }

    scanner.close();

    return new Data(dataset, instances);
  }
Example #25
  @Override
  public void initialize(InputSplit genericSplit, TaskAttemptContext taskAttemptContext)
      throws IOException {
    context = taskAttemptContext;
    FileSplit fileSplit = (FileSplit) genericSplit;
    lzoFile = fileSplit.getPath();
    // The LzoSplitInputFormat is not splittable, so the split length is the whole file.
    totalFileSize = fileSplit.getLength();

    // Jump through some hoops to create the lzo codec.
    Configuration conf = CompatibilityUtil.getConfiguration(context);
    CompressionCodecFactory factory = new CompressionCodecFactory(conf);
    CompressionCodec codec = factory.getCodec(lzoFile);
    ((Configurable) codec).setConf(conf);

    LzopDecompressor lzopDecompressor = (LzopDecompressor) codec.createDecompressor();
    FileSystem fs = lzoFile.getFileSystem(conf);
    rawInputStream = fs.open(lzoFile);

    // Creating the LzopInputStream here just reads the lzo header for us, nothing more.
    // We do the rest of our input off of the raw stream itself.
    codec.createInputStream(rawInputStream, lzopDecompressor);

    // This must be called AFTER createInputStream is called, because createInputStream
    // is what reads the header, which has the checksum information.  Otherwise getChecksumsCount
    // erroneously returns zero, and all block offsets will be wrong.
    numCompressedChecksums = lzopDecompressor.getCompressedChecksumsCount();
    numDecompressedChecksums = lzopDecompressor.getDecompressedChecksumsCount();
  }
Example #26
 static long readHdfsFile(FileSystem fs, Path p, long length, Boolean dropBehind)
     throws Exception {
   FSDataInputStream fis = null;
   long totalRead = 0;
   try {
     fis = fs.open(p);
     if (dropBehind != null) {
       fis.setDropBehind(dropBehind);
     }
     byte buf[] = new byte[8196];
     while (length > 0) {
       int amt = (length > buf.length) ? buf.length : (int) length;
       int ret = fis.read(buf, 0, amt);
       if (ret == -1) {
         return totalRead;
       }
       totalRead += ret;
       length -= ret;
      }
      return totalRead;
    } catch (IOException e) {
     LOG.error("ioexception", e);
   } finally {
     if (fis != null) {
       fis.close();
     }
   }
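   // Reached only when an IOException was caught and logged above.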
   throw new RuntimeException("unreachable");
 }
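A hypothetical call that reads the first 8 KB of a file with drop-behind enabled (the path and length are placeholders):

  long bytesRead = readHdfsFile(fs, new Path("/test"), 8192, true);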
Example #27
    public QseqRecordReader(Configuration conf, FileSplit split) throws IOException {
      setConf(conf);
      file = split.getPath();
      start = split.getStart();
      end = start + split.getLength();

      FileSystem fs = file.getFileSystem(conf);
      FSDataInputStream fileIn = fs.open(file);

      CompressionCodecFactory codecFactory = new CompressionCodecFactory(conf);
      CompressionCodec codec = codecFactory.getCodec(file);

      if (codec == null) // no codec.  Uncompressed file.
      {
        positionAtFirstRecord(fileIn);
        inputStream = fileIn;
      } else { // compressed file
        if (start != 0)
          throw new RuntimeException(
              "Start position for compressed file is not 0! (found " + start + ")");

        inputStream = codec.createInputStream(fileIn);
        end = Long.MAX_VALUE; // read until the end of the file
      }

      lineReader = new LineReader(inputStream);
    }
Example #28
  public PrefixEncodedGlobalStatsWithIndex(Path prefixSetPath, FileSystem fs) throws IOException {
    fileSys = fs;
    FSDataInputStream termsInput = fileSys.open(prefixSetPath);

    prefixSet.readFields(termsInput);
    termsInput.close();
  }
Example #29
  /**
   * Generates the Dataset by parsing the entire data
   *
   * @param descriptor attributes description
   * @param fs file system
   * @param path data path
   */
  public static Dataset generateDataset(String descriptor, FileSystem fs, Path path)
      throws DescriptorException, IOException {
    Attribute[] attrs = DescriptorUtils.parseDescriptor(descriptor);

    FSDataInputStream input = fs.open(path);
    Scanner scanner = new Scanner(input);

    // used to convert CATEGORICAL attribute to Integer
    @SuppressWarnings("unchecked")
    List<String>[] values = new List[attrs.length];

    int id = 0;
    while (scanner.hasNextLine()) {
      String line = scanner.nextLine();
      if (line.isEmpty()) {
        continue;
      }

      if (parseString(id, attrs, values, line) != null) {
        id++;
      }
    }

    scanner.close();

    return new Dataset(attrs, values, id);
  }
Example #30
  public static void downloadHdfs(String srcfilePath, String destFilePath) {
    try {
      Configuration conf = new Configuration();
      FileSystem fs = FileSystem.get(URI.create(srcfilePath), conf);
      FSDataInputStream hdfsInStream = fs.open(new Path(srcfilePath));
      File dstFile = new File(destFilePath);
      if (!dstFile.getParentFile().exists()) {
        dstFile.getParentFile().mkdirs();
      }
      OutputStream out = new FileOutputStream(destFilePath);
      byte[] ioBuffer = new byte[1024];
      int readLen = hdfsInStream.read(ioBuffer);

      while (-1 != readLen) {
        out.write(ioBuffer, 0, readLen);
        readLen = hdfsInStream.read(ioBuffer);
      }
      out.close();
      hdfsInStream.close();
      fs.close();
    } catch (IOException e) {
      LOG.error("[downloadHdfs]", e);
    }
  }
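A hypothetical invocation (the HDFS URI and local path are placeholders):

  downloadHdfs("hdfs://namenode:8020/user/alice/input.txt", "/tmp/input.txt");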