public static Vocabulary load(Path vocabFile, FileSystem fs) throws IOException {
  // First pass: count the entries with a positive index so the vocabulary array can be sized.
  BufferedReader in = new BufferedReader(new InputStreamReader(fs.open(vocabFile)));
  int size = 0;
  String line;
  while ((line = in.readLine()) != null) {
    String[] columns = StringUtils.tokenize(line, "\t");
    int n = Integer.parseInt(columns[1]);
    if (n > 0) {
      size++;
    }
  }
  in.close();

  // Second pass: re-open the file and place each token at its (1-based) index.
  in = new BufferedReader(new InputStreamReader(fs.open(vocabFile)));
  Vocabulary v = new Vocabulary(size);
  while ((line = in.readLine()) != null) {
    String[] columns = StringUtils.tokenize(line, "\t");
    String tok = columns[0];
    int n = Integer.parseInt(columns[1]);
    if (n > 0) {
      v.vocab[n - 1] = tok;
    }
  }
  in.close();
  return v;
}
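// Hedged usage sketch (added, not from the original source): one way the two-pass loader above
// might be invoked. The vocabulary path is a hypothetical placeholder, and the qualified call
// assumes load() is a static member of the Vocabulary class; only the Hadoop FileSystem calls
// are standard API.
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class VocabularyLoadExample {
  public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    Path vocabFile = new Path(args[0]); // tab-separated "token<TAB>index" lines
    FileSystem fs = vocabFile.getFileSystem(conf);
    Vocabulary vocab = Vocabulary.load(vocabFile, fs); // reads the file twice: count, then fill
  }
}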
public static void main(String[] args) throws IOException {
  String uri = args[0];
  Configuration configuration = new Configuration();
  System.out.println("Trying to get the file system object");
  URI uriObj = URI.create(uri);
  System.out.println("Got URI object " + uri);
  FileSystem fs = FileSystem.get(uriObj, configuration);

  Path hdfsPath = new Path(uri);
  FSDataInputStream fsDataInputStream = fs.open(hdfsPath);
  // This specifies that reading starts from the 0th byte.
  fsDataInputStream.seek(0);
  IOUtils.copyBytes(fsDataInputStream, System.out, 4096, false);
  fsDataInputStream.close();
  System.out.println("*******************************************");

  BufferedReader br = new BufferedReader(new InputStreamReader(fs.open(hdfsPath)));
  try {
    String line = br.readLine();
    while (line != null) {
      System.out.println("################ Line is###### " + line);
      // Be sure to read the next line, otherwise you'll get an infinite loop.
      line = br.readLine();
    }
  } finally {
    // Always close the BufferedReader.
    br.close();
  }
}
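// Hedged alternative sketch (added): the same copy-to-stdout can be written with
// try-with-resources so the stream is closed even if copyBytes throws. The path argument is
// a hypothetical placeholder; IOUtils here is org.apache.hadoop.io.IOUtils.
import java.io.InputStream;
import java.net.URI;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IOUtils;

public class CatHdfsFile {
  public static void main(String[] args) throws Exception {
    String uri = args[0]; // e.g. an hdfs:// URI passed on the command line
    Configuration conf = new Configuration();
    FileSystem fs = FileSystem.get(URI.create(uri), conf);
    try (InputStream in = fs.open(new Path(uri))) {
      // The 'false' flag tells copyBytes not to close the streams; the try block does it.
      IOUtils.copyBytes(in, System.out, 4096, false);
    }
  }
}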
@Override
public void setup(Context context) throws IOException, InterruptedException {
  LOGGER.info("TopKRollupPhaseOneJob.TopKRollupPhaseOneReducer.setup()");
  Configuration configuration = context.getConfiguration();
  FileSystem fileSystem = FileSystem.get(configuration);
  Path configPath = new Path(configuration.get(TOPK_ROLLUP_PHASE1_CONFIG_PATH.toString()));
  try {
    starTreeConfig = StarTreeConfig.decode(fileSystem.open(configPath));
    config = TopKRollupPhaseOneConfig.fromStarTreeConfig(starTreeConfig);
    dimensionNames = config.getDimensionNames();
    metricTypes = config.getMetricTypes();
    metricSchema = new MetricSchema(config.getMetricNames(), metricTypes);
    metricThresholds = config.getMetricThresholds();
    keyWritable = new BytesWritable();
    valWritable = new BytesWritable();
    MetricSums metricSumsObj =
        OBJECT_MAPPER.readValue(
            fileSystem.open(
                new Path(configuration.get(TOPK_ROLLUP_PHASE1_METRIC_SUMS_PATH.toString()))),
            MetricSums.class);
    metricSums = metricSumsObj.getMetricSum();
  } catch (Exception e) {
    throw new IOException(e);
  }
}
private static void addFolder(FileSystem fs, Path p, JsonArray succeeded, JsonArray failed) {
  try {
    if (fs == null) return;
    for (FileStatus file : fs.listStatus(p)) {
      Path pfs = file.getPath();
      if (file.isDir()) {
        addFolder(fs, pfs, succeeded, failed);
      } else {
        Key k = Key.make(pfs.toString());
        long size = file.getLen();
        Value val = null;
        if (pfs.getName().endsWith(Extensions.JSON)) {
          JsonParser parser = new JsonParser();
          JsonObject json = parser.parse(new InputStreamReader(fs.open(pfs))).getAsJsonObject();
          JsonElement v = json.get(Constants.VERSION);
          if (v == null) throw new RuntimeException("Missing version");
          JsonElement type = json.get(Constants.TYPE);
          if (type == null) throw new RuntimeException("Missing type");
          Class c = Class.forName(type.getAsString());
          OldModel model = (OldModel) c.newInstance();
          model.fromJson(json);
        } else if (pfs.getName().endsWith(Extensions.HEX)) { // Hex file?
          FSDataInputStream s = fs.open(pfs);
          int sz = (int) Math.min(1L << 20, size); // Read up to the 1st meg
          byte[] mem = MemoryManager.malloc1(sz);
          s.readFully(mem);
          // Convert to a ValueArray (hope it fits in 1Meg!)
          ValueArray ary = new ValueArray(k, 0).read(new AutoBuffer(mem));
          val = new Value(k, ary, Value.HDFS);
        } else if (size >= 2 * ValueArray.CHUNK_SZ) {
          // ValueArray byte wrapper over a large file
          val = new Value(k, new ValueArray(k, size), Value.HDFS);
        } else {
          val = new Value(k, (int) size, Value.HDFS); // Plain Value
          val.setdsk();
        }
        DKV.put(k, val);
        Log.info("PersistHdfs: DKV.put(" + k + ")");
        JsonObject o = new JsonObject();
        o.addProperty(Constants.KEY, k.toString());
        o.addProperty(Constants.FILE, pfs.toString());
        o.addProperty(Constants.VALUE_SIZE, file.getLen());
        succeeded.add(o);
      }
    }
  } catch (Exception e) {
    Log.err(e);
    JsonObject o = new JsonObject();
    o.addProperty(Constants.FILE, p.toString());
    o.addProperty(Constants.ERROR, e.getMessage());
    failed.add(o);
  }
}
/** This tests {@link StringWriter} with non-rolling output. */
@Test
public void testNonRollingStringWriter() throws Exception {
  final int NUM_ELEMENTS = 20;
  final int PARALLELISM = 2;
  final String outPath = hdfsURI + "/string-non-rolling-out";

  StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
  env.setParallelism(PARALLELISM);

  DataStream<Tuple2<Integer, String>> source =
      env.addSource(new TestSourceFunction(NUM_ELEMENTS)).broadcast().filter(new OddEvenFilter());

  RollingSink<String> sink =
      new RollingSink<String>(outPath)
          .setBucketer(new NonRollingBucketer())
          .setPartPrefix("part")
          .setPendingPrefix("")
          .setPendingSuffix("");

  source
      .map(
          new MapFunction<Tuple2<Integer, String>, String>() {
            private static final long serialVersionUID = 1L;

            @Override
            public String map(Tuple2<Integer, String> value) throws Exception {
              return value.f1;
            }
          })
      .addSink(sink);

  env.execute("RollingSink String Write Test");

  FSDataInputStream inStream = dfs.open(new Path(outPath + "/part-0-0"));
  BufferedReader br = new BufferedReader(new InputStreamReader(inStream));
  for (int i = 0; i < NUM_ELEMENTS; i += 2) {
    String line = br.readLine();
    Assert.assertEquals("message #" + i, line);
  }
  inStream.close();

  inStream = dfs.open(new Path(outPath + "/part-1-0"));
  br = new BufferedReader(new InputStreamReader(inStream));
  for (int i = 1; i < NUM_ELEMENTS; i += 2) {
    String line = br.readLine();
    Assert.assertEquals("message #" + i, line);
  }
  inStream.close();
}
/**
 * Generates the Dataset by parsing the entire data.
 *
 * @param descriptor attributes description
 * @param regression if true, the label is numerical
 * @param fs file system
 * @param path data path
 */
public static Dataset generateDataset(
    CharSequence descriptor, boolean regression, FileSystem fs, Path path)
    throws DescriptorException, IOException {
  Attribute[] attrs = DescriptorUtils.parseDescriptor(descriptor);

  FSDataInputStream input = fs.open(path);
  Scanner scanner = new Scanner(input, "UTF-8");

  // used to convert CATEGORICAL attribute to Integer
  @SuppressWarnings("unchecked")
  Set<String>[] valsets = new Set[attrs.length];

  int size = 0;
  while (scanner.hasNextLine()) {
    String line = scanner.nextLine();
    if (!line.isEmpty()) {
      if (parseString(attrs, valsets, line, regression)) {
        size++;
      }
    }
  }
  scanner.close();

  @SuppressWarnings("unchecked")
  List<String>[] values = new List[attrs.length];
  for (int i = 0; i < valsets.length; i++) {
    if (valsets[i] != null) {
      values[i] = Lists.newArrayList(valsets[i]);
    }
  }

  return new Dataset(attrs, values, size, regression);
}
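// Hedged usage sketch (added): one way generateDataset(...) above might be called. The
// descriptor string and data path are hypothetical placeholders whose exact format is defined
// by the surrounding project's DescriptorUtils, and the unqualified call assumes this example
// lives in the same class as the method above.
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class GenerateDatasetExample {
  public static void main(String[] args) throws Exception {
    String descriptor = args[0];       // attribute description understood by DescriptorUtils
    Path dataPath = new Path(args[1]); // input data file on HDFS
    boolean regression = Boolean.parseBoolean(args[2]);

    Configuration conf = new Configuration();
    FileSystem fs = dataPath.getFileSystem(conf);
    Dataset dataset = generateDataset(descriptor, regression, fs, dataPath);
  }
}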
/**
 * Runs a range query on the local machine by iterating over the whole file.
 *
 * @param fs - FileSystem that contains input file
 * @param file - path to the input file
 * @param queryRange - The range to look in
 * @param shape - An instance of the shape stored in file
 * @param output - Output is sent to this collector. If <code>null</code>, output is not
 *     collected and only the number of results is returned.
 * @return number of results found
 * @throws IOException
 */
public static <S extends Shape> long rangeQueryLocal(
    FileSystem fs, Path file, Shape queryRange, S shape, ResultCollector<S> output)
    throws IOException {
  long file_size = fs.getFileStatus(file).getLen();
  ShapeRecordReader<S> shapeReader = new ShapeRecordReader<S>(fs.open(file), 0, file_size);

  long resultCount = 0;
  Prism cell = shapeReader.createKey();

  while (shapeReader.next(cell, shape)) {
    if (shape.isIntersected(queryRange)) {
      boolean report_result;
      if (cell.isValid()) {
        // Check for duplicate avoidance
        Prism intersection_mbr = queryRange.getMBR().getIntersection(shape.getMBR());
        report_result =
            cell.contains(intersection_mbr.t1, intersection_mbr.x1, intersection_mbr.y1);
      } else {
        report_result = true;
      }
      if (report_result) {
        resultCount++;
        if (output != null) {
          output.collect(shape);
        }
      }
    }
  }
  shapeReader.close();
  return resultCount;
}
@Test(timeout = 120000)
public void testSeekAfterSetDropBehind() throws Exception {
  // start a cluster
  LOG.info("testSeekAfterSetDropBehind");
  Configuration conf = new HdfsConfiguration();
  MiniDFSCluster cluster = null;
  String TEST_PATH = "/test";
  int TEST_PATH_LEN = MAX_TEST_FILE_LEN;
  try {
    cluster = new MiniDFSCluster.Builder(conf).numDataNodes(1).build();
    cluster.waitActive();
    FileSystem fs = cluster.getFileSystem();
    createHdfsFile(fs, new Path(TEST_PATH), TEST_PATH_LEN, false);
    // verify that we can seek after setDropBehind
    FSDataInputStream fis = fs.open(new Path(TEST_PATH));
    try {
      Assert.assertTrue(fis.read() != -1); // create BlockReader
      fis.setDropBehind(false); // clear BlockReader
      fis.seek(2); // seek
    } finally {
      fis.close();
    }
  } finally {
    if (cluster != null) {
      cluster.shutdown();
    }
  }
}
// Load centroids from HDFS.
static void loadCentroids(
    ArrTable<DoubleArray> cenTable, int vectorSize, String cFileName, Configuration configuration)
    throws IOException {
  Path cPath = new Path(cFileName);
  FileSystem fs = FileSystem.get(configuration);
  FSDataInputStream in = fs.open(cPath);
  BufferedReader br = new BufferedReader(new InputStreamReader(in));
  String line = "";
  String[] vector = null;
  int partitionId = 0;
  while ((line = br.readLine()) != null) {
    vector = line.split("\\s+");
    if (vector.length != vectorSize) {
      System.out.println("Errors while loading centroids.");
      System.exit(-1);
    } else {
      // Allocate one extra slot beyond the vector itself, initialized to 0.
      double[] aCen = new double[vectorSize + 1];
      for (int i = 0; i < vectorSize; i++) {
        aCen[i] = Double.parseDouble(vector[i]);
      }
      aCen[vectorSize] = 0;
      ArrPartition<DoubleArray> ap =
          new ArrPartition<DoubleArray>(partitionId, new DoubleArray(aCen, 0, vectorSize + 1));
      cenTable.addPartition(ap);
      partitionId++;
    }
  }
  br.close();
}
private double[] getSparkModelInfoFromHDFS(Path location, Configuration conf) throws Exception {
  FileSystem fileSystem = FileSystem.get(location.toUri(), conf);
  FileStatus[] files = fileSystem.listStatus(location);
  if (files == null) {
    throw new Exception("Couldn't find Spark Truck ML weights at: " + location);
  }
  ArrayList<Double> modelInfo = new ArrayList<Double>();
  for (FileStatus file : files) {
    if (file.getPath().getName().startsWith("_")) {
      continue;
    }
    InputStream stream = fileSystem.open(file.getPath());
    StringWriter writer = new StringWriter();
    IOUtils.copy(stream, writer, "UTF-8");
    String raw = writer.toString();
    for (String str : raw.split("\n")) {
      modelInfo.add(Double.valueOf(str));
    }
  }
  return Doubles.toArray(modelInfo);
}
public static void readHiveResult(String path, OutputStreamWriter outStream, Configuration conf)
    throws IOException {
  FileSystem fs = FileSystem.get(conf);
  Path dir = new Path(path);
  if (!fs.exists(dir)) {
    throw new IOException("cannot find path: " + path);
  }
  FileStatus[] filelist = fs.listStatus(dir);
  long bytesRead = 0L;
  long maxsize = 1024L * 1024 * 1024 * 10; // stop after roughly 10 GB
  for (FileStatus f : filelist) {
    if (!f.isDir() && !f.getPath().getName().startsWith("_")) {
      FSDataInputStream in = fs.open(f.getPath());
      BufferedReader bf = new BufferedReader(new InputStreamReader(in));
      String line;
      while ((line = bf.readLine()) != null) {
        bytesRead += line.getBytes().length;
        outStream.write(line.replaceAll("\001", ",").replaceAll("\t", ","));
        outStream.write("\r\n");
        if (bytesRead >= maxsize) {
          bf.close();
          in.close();
          return;
        }
      }
      bf.close();
      in.close();
    }
  }
}
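// Hedged usage sketch (added): streaming a Hive result directory to standard output via
// readHiveResult(...) above. The directory path is a hypothetical placeholder, and the
// unqualified call assumes the example lives in the same class as the method above.
import java.io.OutputStreamWriter;
import org.apache.hadoop.conf.Configuration;

public class ReadHiveResultExample {
  public static void main(String[] args) throws Exception {
    String resultDir = args[0]; // HDFS directory produced by a Hive query
    OutputStreamWriter out = new OutputStreamWriter(System.out, "UTF-8");
    readHiveResult(resultDir, out, new Configuration());
    out.flush();
  }
}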
/** {@inheritDoc} */
@Override
public int open(String path) throws IOException {
  long startTime = System.currentTimeMillis();
  InputStream in = fileSystem.open(new Path(path));
  in.close();
  return (int) (System.currentTimeMillis() - startTime);
}
/**
 * Sets {@link #mHdfsInputStream} to a stream from the under storage system with the stream
 * starting at position. The {@link #mCurrentPosition} is not modified to be position.
 *
 * @throws IOException if opening the file fails
 */
private void getHdfsInputStream(long position) throws IOException {
  if (mHdfsInputStream == null) {
    org.apache.hadoop.fs.FileSystem fs = mHdfsPath.getFileSystem(mHadoopConf);
    mHdfsInputStream = fs.open(mHdfsPath, mHadoopBufferSize);
  }
  mHdfsInputStream.seek(position);
}
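// Hedged alternative sketch (added): instead of seek()-ing a shared stream, Hadoop's
// FSDataInputStream also supports positioned reads, which do not move the stream's own file
// pointer. The path and read position below are hypothetical placeholders.
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class PositionedReadExample {
  public static void main(String[] args) throws Exception {
    Path path = new Path(args[0]);
    long position = Long.parseLong(args[1]);
    FileSystem fs = path.getFileSystem(new Configuration());
    byte[] buffer = new byte[4096];
    try (FSDataInputStream in = fs.open(path)) {
      // Reads up to buffer.length bytes starting at 'position' without seeking the stream.
      int bytesRead = in.read(position, buffer, 0, buffer.length);
      System.out.println("Read " + bytesRead + " bytes at offset " + position);
    }
  }
}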
@Override
public ModelInput<StringBuilder> createInput(
    Class<? extends StringBuilder> dataType,
    FileSystem fileSystem,
    Path path,
    long offset,
    long fragmentSize,
    Counter counter)
    throws IOException, InterruptedException {
  FileSystem fs = FileSystem.get(path.toUri(), getConf());
  FSDataInputStream in = fs.open(path);
  boolean succeed = false;
  try {
    in.seek(offset);
    ModelInput<StringBuilder> result =
        format.createInput(
            dataType, path.toString(), new CountInputStream(in, counter), offset, fragmentSize);
    succeed = true;
    return result;
  } finally {
    if (!succeed) {
      in.close();
    }
  }
}
/**
 * Retrieves the centroids between K-means iterations.
 *
 * @return the centroids
 */
public static long[] getCentroids() throws IOException {
  Configuration conf = setupConf();
  FileSystem fs = FileSystem.get(conf);
  Path path = new Path(BASE_OUTPUT + CENTROID_FILE);
  long[] centroids = new long[4];

  FSDataInputStream in = fs.open(path);
  centroids[0] = Long.parseLong(in.readUTF());
  in.readChar();
  in.readUTF();
  in.readChar();
  centroids[1] = Long.parseLong(in.readUTF());
  in.readChar();
  in.readUTF();
  in.readChar();
  in.readUTF();
  in.readChar();
  centroids[2] = Long.parseLong(in.readUTF());
  in.readChar();
  in.readUTF();
  in.readChar();
  in.readUTF();
  in.readChar();
  centroids[3] = Long.parseLong(in.readUTF());
  in.close();

  return centroids;
}
public static void createCentersSequenceFile(
    Configuration conf, FileSystem fs, String centroidsPath, String sequenceFilePath)
    throws Exception {
  Path seqFile = new Path(sequenceFilePath);
  if (fs.exists(seqFile)) {
    fs.delete(seqFile, true);
  }
  FSDataInputStream inputStream = fs.open(new Path(centroidsPath));
  SequenceFile.Writer writer =
      SequenceFile.createWriter(fs, conf, seqFile, Centroid.class, IntWritable.class);
  IntWritable value = new IntWritable(0);
  while (inputStream.available() > 0) {
    String line = inputStream.readLine();
    StringTokenizer tokenizer = new StringTokenizer(line, " ");
    int dim = tokenizer.countTokens() - 1;
    int clusterId = Integer.valueOf(tokenizer.nextToken());
    double[] coords = new double[dim];
    for (int i = 0; i < dim; i++) {
      coords[i] = Double.valueOf(tokenizer.nextToken());
    }
    Centroid cluster = new Centroid(clusterId, new Point(coords));
    writer.append(cluster, value);
  }
  IOUtils.closeStream(writer);
  inputStream.close();
}
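// Hedged alternative sketch (added): DataInputStream.readLine() is deprecated and available()
// is not a reliable end-of-file test on HDFS streams, so the line-reading part above is often
// written with a BufferedReader instead. The path is a hypothetical placeholder; only the
// reading pattern is shown here, not the SequenceFile writing.
import java.io.BufferedReader;
import java.io.InputStreamReader;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class ReadCentroidLines {
  public static void main(String[] args) throws Exception {
    Path centroids = new Path(args[0]); // text file: "<clusterId> <coord1> <coord2> ..."
    FileSystem fs = centroids.getFileSystem(new Configuration());
    try (BufferedReader reader = new BufferedReader(new InputStreamReader(fs.open(centroids)))) {
      String line;
      while ((line = reader.readLine()) != null) {
        String[] tokens = line.split(" ");
        System.out.println("cluster " + tokens[0] + " has " + (tokens.length - 1) + " coordinates");
      }
    }
  }
}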
private InputStream OpenMultiplePartsWithOffset(FileSystem fs, Path pt, long offset)
    throws IOException {
  RemoteIterator<LocatedFileStatus> rit = fs.listFiles(pt, false);
  Vector<FSDataInputStream> fileHandleList = new Vector<FSDataInputStream>();
  while (rit.hasNext()) {
    Path path = rit.next().getPath();
    String filename =
        path.toString().substring(path.getParent().toString().length(), path.toString().length());
    if (filename.startsWith("/part-")) {
      long filesize = fs.getFileStatus(path).getLen();
      if (offset < filesize) {
        FSDataInputStream handle = fs.open(path);
        if (offset > 0) {
          handle.seek(offset);
        }
        fileHandleList.add(handle);
      }
      offset -= filesize;
    }
  }
  if (fileHandleList.size() == 1) {
    return fileHandleList.get(0);
  } else if (fileHandleList.size() > 1) {
    Enumeration<FSDataInputStream> enu = fileHandleList.elements();
    return new SequenceInputStream(enu);
  } else {
    System.err.println("Error, no source file loaded. Run genSeedDataset.sh first!");
    return null;
  }
}
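// Hedged standalone sketch (added): the same idea as above, concatenating all part files under
// a directory into one InputStream with SequenceInputStream, written as a self-contained
// example. The directory path is a hypothetical placeholder.
import java.io.InputStream;
import java.io.SequenceInputStream;
import java.util.Vector;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.LocatedFileStatus;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.RemoteIterator;
import org.apache.hadoop.io.IOUtils;

public class ConcatPartFiles {
  public static void main(String[] args) throws Exception {
    Path dir = new Path(args[0]); // directory containing part-* files
    FileSystem fs = dir.getFileSystem(new Configuration());
    Vector<FSDataInputStream> streams = new Vector<FSDataInputStream>();
    RemoteIterator<LocatedFileStatus> it = fs.listFiles(dir, false);
    while (it.hasNext()) {
      Path p = it.next().getPath();
      if (p.getName().startsWith("part-")) {
        streams.add(fs.open(p));
      }
    }
    // SequenceInputStream reads each part stream to exhaustion before moving to the next.
    try (InputStream in = new SequenceInputStream(streams.elements())) {
      IOUtils.copyBytes(in, System.out, 4096, false);
    }
  }
}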
public void initialize(InputSplit genericSplit, TaskAttemptContext context) throws IOException {
  Configuration conf = ContextUtil.getConfiguration(context);
  this.maxLineLength = conf.getInt("mapred.linerecordreader.maxlength", Integer.MAX_VALUE);

  FileSplit split = (FileSplit) genericSplit;
  start = (split.getStart()) << 16;
  end = (start + split.getLength()) << 16;

  final Path file = split.getPath();
  FileSystem fs = file.getFileSystem(conf);
  bin =
      new BlockCompressedInputStream(
          new WrapSeekable<FSDataInputStream>(
              fs.open(file), fs.getFileStatus(file).getLen(), file));
  in = new LineReader(bin, conf);

  if (start != 0) {
    bin.seek(start);
    // Skip first line
    in.readLine(new Text());
    start = bin.getFilePointer();
  }
  this.pos = start;
}
// Load data from HDFS.
static ArrayList<DoubleArray> loadData(List<String> fileNames, int vectorSize, Configuration conf)
    throws IOException {
  ArrayList<DoubleArray> data = new ArrayList<DoubleArray>();
  for (String filename : fileNames) {
    FileSystem fs = FileSystem.get(conf);
    Path dPath = new Path(filename);
    FSDataInputStream in = fs.open(dPath);
    BufferedReader br = new BufferedReader(new InputStreamReader(in));
    String line = "";
    String[] vector = null;
    while ((line = br.readLine()) != null) {
      vector = line.split("\\s+");
      if (vector.length != vectorSize) {
        System.out.println("Errors while loading data.");
        System.exit(-1);
      } else {
        double[] aDataPoint = new double[vectorSize];
        for (int i = 0; i < vectorSize; i++) {
          aDataPoint[i] = Double.parseDouble(vector[i]);
        }
        DoubleArray da = new DoubleArray(aDataPoint, 0, vectorSize);
        data.add(da);
      }
    }
    br.close();
  }
  return data;
}
@Override
public List<String> getContent(String path, int lineCount) throws IOException {
  FileStatus[] files = fileSystem.globStatus(new Path(path));
  ArrayList<String> lines = new ArrayList<String>();
  if (files != null) {
    for (FileStatus file : files) {
      if (lines.size() >= lineCount) {
        break;
      }
      if (!file.isDirectory()) {
        DataInputStream in = fileSystem.open(file.getPath());
        BufferedReader dataReader = new BufferedReader(new InputStreamReader(in));
        String line = dataReader.readLine();
        while (line != null && lines.size() < lineCount) {
          lines.add(line);
          line = dataReader.readLine();
        }
        dataReader.close();
        in.close();
      }
    }
  }
  return lines;
}
/**
 * Check duplicated tweet IDs in <b>tweetIdDir</b>, and output the duplicates to stdout.
 *
 * @param tweetIdDir
 * @throws Exception
 */
public static void checkTidDuplicates(String tweetIdDir) throws Exception {
  // First change path strings to URI strings starting with 'file:' or 'hdfs:'
  tweetIdDir = MultiFileFolderWriter.getUriStrForPath(tweetIdDir);
  Set<String> tidSet = new HashSet<String>();
  Configuration conf = HBaseConfiguration.create();
  FileSystem fs = FileSystem.get(new URI(tweetIdDir), conf);
  int dupCount = 0;
  for (FileStatus srcFileStatus : fs.listStatus(new Path(tweetIdDir))) {
    String srcFileName = srcFileStatus.getPath().getName();
    if (srcFileName.endsWith(".txt") && srcFileName.contains("tweetIds")) {
      BufferedReader brTid =
          new BufferedReader(new InputStreamReader(fs.open(srcFileStatus.getPath())));
      String tid = brTid.readLine();
      while (tid != null) {
        if (tidSet.contains(tid)) {
          System.out.println("Duplicated tweet ID: " + tid);
          dupCount++;
        } else {
          tidSet.add(tid);
        }
        tid = brTid.readLine();
      }
      brTid.close();
    }
  }
  System.out.println(
      "Number of unique tweet IDs: " + tidSet.size() + ", number of duplicates: " + dupCount);
}
private void loadDocumentIndex(String documentIndexPath) throws IOException {
  if (documentIndex == null) {
    documentIndex = new HashMap<String, Integer>();
    Path p = new Path(documentIndexPath);
    FileSystem fs = FileSystem.get(p.toUri(), new Configuration());
    int index = 0;
    for (FileStatus status : fs.listStatus(p)) {
      Path currPath = status.getPath();
      if (!status.isDir() && !currPath.getName().startsWith("_")) {
        BufferedReader reader = null;
        try {
          reader = new BufferedReader(new InputStreamReader(fs.open(currPath)));
          String line = null;
          while ((line = reader.readLine()) != null) {
            documentIndex.put(line.trim(), index++);
          }
        } finally {
          if (reader != null) {
            reader.close();
          }
        }
      }
    }
    log.info("Loaded document index with size: " + documentIndex.size());
  }
}
@Override
public boolean nextKeyValue() throws IOException, InterruptedException {
  FileSystem fileSystem = FileSystem.get(configuration);
  if (fileSystem.isDirectory(split.getPath())) {
    return false;
  }
  if (fileProcessed) {
    return false;
  }
  int fileLength = (int) split.getLength();
  byte[] result = new byte[fileLength];
  FSDataInputStream inputStream = null;
  try {
    inputStream = fileSystem.open(split.getPath());
    IOUtils.readFully(inputStream, result, 0, fileLength);
    currentValue.set(result, 0, fileLength);
  } finally {
    IOUtils.closeStream(inputStream);
  }
  fileProcessed = true;
  return true;
}
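// Hedged standalone sketch (added): the whole-file read above, written as a self-contained
// utility. Assumes the file is small enough to fit in memory and in an int-sized array; the
// path is a hypothetical placeholder. IOUtils here is org.apache.hadoop.io.IOUtils.
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IOUtils;

public class ReadWholeFile {
  public static byte[] readAll(FileSystem fs, Path path) throws Exception {
    int length = (int) fs.getFileStatus(path).getLen(); // whole-file length, assumed < 2 GB
    byte[] contents = new byte[length];
    FSDataInputStream in = null;
    try {
      in = fs.open(path);
      IOUtils.readFully(in, contents, 0, length);
    } finally {
      IOUtils.closeStream(in);
    }
    return contents;
  }

  public static void main(String[] args) throws Exception {
    Path path = new Path(args[0]);
    FileSystem fs = path.getFileSystem(new Configuration());
    System.out.println("Read " + readAll(fs, path).length + " bytes");
  }
}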
/**
 * Loads the data from a file.
 *
 * @param fs file system
 * @param fpath data file path
 * @throws IOException if any problem is encountered
 */
public static Data loadData(Dataset dataset, FileSystem fs, Path fpath) throws IOException {
  FSDataInputStream input = fs.open(fpath);
  Scanner scanner = new Scanner(input);

  List<Instance> instances = Lists.newArrayList();
  DataConverter converter = new DataConverter(dataset);

  while (scanner.hasNextLine()) {
    String line = scanner.nextLine();
    if (line.isEmpty()) {
      log.warn("{}: empty string", instances.size());
      continue;
    }
    Instance instance = converter.convert(instances.size(), line);
    if (instance == null) {
      // missing values found
      log.warn("{}: missing values", instances.size());
      continue;
    }
    instances.add(instance);
  }
  scanner.close();

  return new Data(dataset, instances);
}
@Override
public void initialize(InputSplit genericSplit, TaskAttemptContext taskAttemptContext)
    throws IOException {
  context = taskAttemptContext;
  FileSplit fileSplit = (FileSplit) genericSplit;
  lzoFile = fileSplit.getPath();
  // The LzoSplitInputFormat is not splittable, so the split length is the whole file.
  totalFileSize = fileSplit.getLength();

  // Jump through some hoops to create the lzo codec.
  Configuration conf = CompatibilityUtil.getConfiguration(context);
  CompressionCodecFactory factory = new CompressionCodecFactory(conf);
  CompressionCodec codec = factory.getCodec(lzoFile);
  ((Configurable) codec).setConf(conf);
  LzopDecompressor lzopDecompressor = (LzopDecompressor) codec.createDecompressor();

  FileSystem fs = lzoFile.getFileSystem(conf);
  rawInputStream = fs.open(lzoFile);

  // Creating the LzopInputStream here just reads the lzo header for us, nothing more.
  // We do the rest of our input off of the raw stream ourselves.
  codec.createInputStream(rawInputStream, lzopDecompressor);

  // This must be called AFTER createInputStream is called, because createInputStream
  // is what reads the header, which has the checksum information. Otherwise getChecksumsCount
  // erroneously returns zero, and all block offsets will be wrong.
  numCompressedChecksums = lzopDecompressor.getCompressedChecksumsCount();
  numDecompressedChecksums = lzopDecompressor.getDecompressedChecksumsCount();
}
static long readHdfsFile(FileSystem fs, Path p, long length, Boolean dropBehind) throws Exception {
  FSDataInputStream fis = null;
  long totalRead = 0;
  try {
    fis = fs.open(p);
    if (dropBehind != null) {
      fis.setDropBehind(dropBehind);
    }
    byte buf[] = new byte[8196];
    while (length > 0) {
      int amt = (length > buf.length) ? buf.length : (int) length;
      int ret = fis.read(buf, 0, amt);
      if (ret == -1) {
        return totalRead;
      }
      totalRead += ret;
      length -= ret;
    }
  } catch (IOException e) {
    LOG.error("ioexception", e);
  } finally {
    if (fis != null) {
      fis.close();
    }
  }
  throw new RuntimeException("unreachable");
}
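// Hedged usage sketch (added): the helper above only returns normally from the end-of-file
// branch inside the loop, so callers pass a length larger than the file (Long.MAX_VALUE here)
// to read everything and get the byte count back. The path is a hypothetical placeholder, and
// the unqualified call assumes this example lives in the same class as readHdfsFile.
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class ReadHdfsFileExample {
  public static void main(String[] args) throws Exception {
    Path path = new Path(args[0]);
    FileSystem fs = path.getFileSystem(new Configuration());
    // dropBehind == null leaves the caching strategy unchanged; true/false sets it explicitly.
    long bytesRead = readHdfsFile(fs, path, Long.MAX_VALUE, null);
    System.out.println("Read " + bytesRead + " bytes");
  }
}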
public QseqRecordReader(Configuration conf, FileSplit split) throws IOException {
  setConf(conf);
  file = split.getPath();
  start = split.getStart();
  end = start + split.getLength();

  FileSystem fs = file.getFileSystem(conf);
  FSDataInputStream fileIn = fs.open(file);

  CompressionCodecFactory codecFactory = new CompressionCodecFactory(conf);
  CompressionCodec codec = codecFactory.getCodec(file);

  if (codec == null) { // no codec. Uncompressed file.
    positionAtFirstRecord(fileIn);
    inputStream = fileIn;
  } else { // compressed file
    if (start != 0) {
      throw new RuntimeException(
          "Start position for compressed file is not 0! (found " + start + ")");
    }
    inputStream = codec.createInputStream(fileIn);
    end = Long.MAX_VALUE; // read until the end of the file
  }

  lineReader = new LineReader(inputStream);
}
public PrefixEncodedGlobalStatsWithIndex(Path prefixSetPath, FileSystem fs) throws IOException {
  fileSys = fs;
  FSDataInputStream termsInput = fileSys.open(prefixSetPath);
  prefixSet.readFields(termsInput);
  termsInput.close();
}
/**
 * Generates the Dataset by parsing the entire data.
 *
 * @param descriptor attributes description
 * @param fs file system
 * @param path data path
 */
public static Dataset generateDataset(String descriptor, FileSystem fs, Path path)
    throws DescriptorException, IOException {
  Attribute[] attrs = DescriptorUtils.parseDescriptor(descriptor);

  FSDataInputStream input = fs.open(path);
  Scanner scanner = new Scanner(input);

  // used to convert CATEGORICAL attribute to Integer
  List<String>[] values = new List[attrs.length];

  int id = 0;
  while (scanner.hasNextLine()) {
    String line = scanner.nextLine();
    if (line.isEmpty()) {
      continue;
    }
    if (parseString(id, attrs, values, line) != null) {
      id++;
    }
  }
  scanner.close();

  return new Dataset(attrs, values, id);
}
public static void downloadHdfs(String srcfilePath, String destFilePath) {
  try {
    Configuration conf = new Configuration();
    FileSystem fs = FileSystem.get(URI.create(srcfilePath), conf);
    FSDataInputStream hdfsInStream = fs.open(new Path(srcfilePath));
    File dstFile = new File(destFilePath);
    if (!dstFile.getParentFile().exists()) {
      dstFile.getParentFile().mkdirs();
    }
    OutputStream out = new FileOutputStream(destFilePath);
    byte[] ioBuffer = new byte[1024];
    int readLen = hdfsInStream.read(ioBuffer);
    while (-1 != readLen) {
      out.write(ioBuffer, 0, readLen);
      readLen = hdfsInStream.read(ioBuffer);
    }
    out.close();
    hdfsInStream.close();
    fs.close();
  } catch (FileNotFoundException e) {
    LOG.error("[downloadHdfs]", e);
  } catch (IOException e) {
    LOG.error("[downloadHdfs]", e);
  }
}
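// Hedged alternative sketch (added): the manual read/write loop above can be replaced with
// org.apache.hadoop.io.IOUtils.copyBytes plus try-with-resources, which also guarantees the
// streams are closed on error. Source and destination paths are hypothetical placeholders.
import java.io.File;
import java.io.FileOutputStream;
import java.io.OutputStream;
import java.net.URI;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IOUtils;

public class DownloadHdfsFile {
  public static void download(String srcUri, String destFilePath) throws Exception {
    Configuration conf = new Configuration();
    FileSystem fs = FileSystem.get(URI.create(srcUri), conf);
    File dstFile = new File(destFilePath);
    if (dstFile.getParentFile() != null && !dstFile.getParentFile().exists()) {
      dstFile.getParentFile().mkdirs();
    }
    try (FSDataInputStream in = fs.open(new Path(srcUri));
        OutputStream out = new FileOutputStream(dstFile)) {
      IOUtils.copyBytes(in, out, 4096, false);
    }
  }

  public static void main(String[] args) throws Exception {
    download(args[0], args[1]);
  }
}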