/**
 * Collects every key stored in the SequenceFile at {@code path}.
 *
 * <p>Keys are fetched one at a time with {@code reader.next(null)} and cast to {@link String};
 * iteration stops at the first {@code null} result.
 *
 * @param fs not used by this method; kept so existing callers keep compiling
 * @param path location of the SequenceFile to read
 * @param conf configuration used to open the reader
 * @return the keys in file order; empty if the file holds no records
 * @throws Exception if the file cannot be opened or read
 */
private List<String> getKeyFromSequenceFile(FileSystem fs, Path path, Configuration conf)
    throws Exception {
  List<String> list = new ArrayList<String>();
  SequenceFile.Reader reader = new SequenceFile.Reader(conf, SequenceFile.Reader.file(path));
  try {
    String next = (String) reader.next((String) null);
    while (next != null) {
      list.add(next);
      next = (String) reader.next((String) null);
    }
  } finally {
    // Close even when a read fails, so the underlying stream is not leaked.
    reader.close();
  }
  return list;
}
/* * 将文档中心从hdfs中加载至内存 */ protected void setup(Context context) throws IOException, InterruptedException { // 读取中心点向量数据 Configuration conf = context.getConfiguration(); Path cents = new Path(CENT_PATH); // FileSystem fs = FileSystem.get(conf); FileSystem fs = cents.getFileSystem(conf); SequenceFile.Reader reader = new SequenceFile.Reader(fs, cents, conf); Text key = new Text(); // 读取题号 Text value = new Text(); // 读取题号对应的单词=TFIDF,单词=TFIDF while (reader.next(key, value)) { Map<String, Double> tfidfAndword = new HashMap<String, Double>(); // 存储TFIDF和单词 String[] strs = null; Pattern p = Pattern.compile("\"([^\"]+)\"=([^,}]+)"); // 正则匹配取出:单词和TFIDF Matcher m = p.matcher(value.toString()); while (m.find()) { strs = m.group().split("="); if (strs.length == 2) { tfidfAndword.put( strs[0].replace("\"", "").trim(), Double.parseDouble(strs[1].replace("}", "").trim())); } } centers.put(key.toString(), tfidfAndword); } reader.close(); super.setup(context); }
/** * Reads key-value pairs from a SequenceFile, up to a maximum number. * * @param path path to file * @param max maximum of key-value pairs to read * @return list of key-value pairs */ @SuppressWarnings("unchecked") public static <K extends Writable, V extends Writable> List<PairOfWritables<K, V>> readFile( Path path, FileSystem fs, int max) throws IOException { List<PairOfWritables<K, V>> list = new ArrayList<PairOfWritables<K, V>>(); try { int k = 0; SequenceFile.Reader reader = new SequenceFile.Reader(fs, path, fs.getConf()); K key; V value; key = (K) reader.getKeyClass().newInstance(); value = (V) reader.getValueClass().newInstance(); while (reader.next(key, value)) { k++; list.add(new PairOfWritables<K, V>(key, value)); if (k >= max) { break; } // Create new objects, because the key, value gets reused key = (K) reader.getKeyClass().newInstance(); value = (V) reader.getValueClass().newInstance(); } reader.close(); } catch (IllegalAccessException e) { throw new RuntimeException("Error reading SequenceFile: " + e); } catch (InstantiationException e) { throw new RuntimeException("Error reading SequenceFile: " + e); } return list; }
/** * return the x*y * * @param url * @return */ public Double[] getR(String url) { List<Double> list = new ArrayList<Double>(); Path path = new Path(url); Configuration conf = HUtils.getConf(); SequenceFile.Reader reader = null; try { reader = new SequenceFile.Reader( conf, Reader.file(path), Reader.bufferSize(4096), Reader.start(0)); DoubleArrStrWritable dkey = (DoubleArrStrWritable) ReflectionUtils.newInstance(reader.getKeyClass(), conf); DoublePairWritable dvalue = (DoublePairWritable) ReflectionUtils.newInstance(reader.getValueClass(), conf); while (reader.next(dkey, dvalue)) { // 循环读取文件 // list.add(dvalue.getSum()*dvalue.getDistance()); } } catch (Exception e) { e.printStackTrace(); } finally { IOUtils.closeStream(reader); } Double[] dList = new Double[list.size()]; dList = list.toArray(dList); Arrays.sort(dList); return dList; }
public static XYSeries getXY(String url) { XYSeries xyseries = new XYSeries(""); Path path = new Path(url); Configuration conf = HUtils.getConf(); SequenceFile.Reader reader = null; try { reader = new SequenceFile.Reader( conf, Reader.file(path), Reader.bufferSize(4096), Reader.start(0)); DoubleArrStrWritable dkey = (DoubleArrStrWritable) ReflectionUtils.newInstance(reader.getKeyClass(), conf); DoublePairWritable dvalue = (DoublePairWritable) ReflectionUtils.newInstance(reader.getValueClass(), conf); while (reader.next(dkey, dvalue)) { // 循环读取文件 xyseries.add(dvalue.getFirst(), dvalue.getSecond()); } } catch (Exception e) { e.printStackTrace(); } finally { IOUtils.closeStream(reader); } return xyseries; }
/**
 * Counts the records in the {@code part-r-0000N} files of the {@code t1/pairs} output
 * directory, echoing each key/value pair to standard error.
 *
 * @return total number of records read across all matching part files
 * @throws IOException if listing the directory or reading a file fails
 */
private int readFile() throws IllegalArgumentException, IOException {
  final FileSystem fs = FileSystem.get(MapReduceTestUtils.getConfiguration());
  final Path pairsDir =
      new Path(
          TestUtils.TEMP_DIR
              + File.separator
              + MapReduceTestEnvironment.HDFS_BASE_DIRECTORY
              + "/t1/pairs");
  int total = 0;
  for (final FileStatus status : fs.listStatus(pairsDir)) {
    // Only regular files named like reducer outputs are of interest.
    if (!status.isFile() || !status.getPath().toString().matches(".*part-r-0000[0-9]")) {
      continue;
    }
    try (SequenceFile.Reader in =
        new SequenceFile.Reader(
            MapReduceTestUtils.getConfiguration(), Reader.file(status.getPath()))) {
      final Text k = new Text();
      final Text v = new Text();
      while (in.next(k, v)) {
        total++;
        System.err.println(k + "\t" + v);
      }
    }
  }
  return total;
}
/**
 * Verifies that the file list contains no duplicated key.
 *
 * <p>The list is first sorted into {@code sorted}; equal keys are then adjacent, so each
 * record only needs to be compared with its predecessor.
 *
 * @param fs file system holding both files
 * @param file unsorted input list
 * @param sorted destination for the sorted list
 * @param conf configuration for the sorter and reader
 * @throws DuplicationException if two consecutive sorted records share the same key
 * @throws IOException if sorting or reading fails
 */
private static void checkDuplication(FileSystem fs, Path file, Path sorted, Configuration conf)
    throws IOException {
  SequenceFile.Reader in = null;
  try {
    new SequenceFile.Sorter(fs, new Text.Comparator(), Text.class, Text.class, conf)
        .sort(file, sorted);
    in = new SequenceFile.Reader(fs, sorted, conf);

    Text previousDst = null;
    Text previousSrc = null;
    Text currentDst = new Text();
    Text currentSrc = new Text();
    while (in.next(currentDst, currentSrc)) {
      if (previousDst != null && currentDst.equals(previousDst)) {
        throw new DuplicationException(
            "Invalid input, there are duplicated files in the sources: "
                + previousSrc
                + ", "
                + currentSrc);
      }
      // The record just read becomes "previous"; fresh objects receive the next one.
      previousDst = currentDst;
      previousSrc = currentSrc;
      currentDst = new Text();
      currentSrc = new Text();
    }
  } finally {
    checkAndClose(in);
  }
}
/** * Result file contains hierarchy of workerID-resultvar(incl filename). We deduplicate on the * workerID. Without JVM reuse each task refers to a unique workerID, so we will not find any * duplicates. With JVM reuse, however, each slot refers to a workerID, and there are duplicate * filenames due to partial aggregation and overwrite of fname (the RemoteParWorkerMapper ensures * uniqueness of those files independent of the runtime implementation). * * @param job * @param fname * @return * @throws DMLRuntimeException */ @SuppressWarnings("deprecation") public static LocalVariableMap[] readResultFile(JobConf job, String fname) throws DMLRuntimeException, IOException { HashMap<Long, LocalVariableMap> tmp = new HashMap<Long, LocalVariableMap>(); FileSystem fs = FileSystem.get(job); Path path = new Path(fname); LongWritable key = new LongWritable(); // workerID Text value = new Text(); // serialized var header (incl filename) int countAll = 0; for (Path lpath : MatrixReader.getSequenceFilePaths(fs, path)) { SequenceFile.Reader reader = new SequenceFile.Reader(FileSystem.get(job), lpath, job); try { while (reader.next(key, value)) { // System.out.println("key="+key.get()+", value="+value.toString()); if (!tmp.containsKey(key.get())) tmp.put(key.get(), new LocalVariableMap()); Object[] dat = ProgramConverter.parseDataObject(value.toString()); tmp.get(key.get()).put((String) dat[0], (Data) dat[1]); countAll++; } } finally { if (reader != null) reader.close(); } } LOG.debug("Num remote worker results (before deduplication): " + countAll); LOG.debug("Num remote worker results: " + tmp.size()); // create return array return tmp.values().toArray(new LocalVariableMap[0]); }
public void configure(JobConf conf) { numberOfCenters = Integer.valueOf(conf.get("numberOfCenters")); centersDirectory = conf.get("centersReadDirectory"); try { Configuration c = new Configuration(); FileSystem fs = FileSystem.get(c); for (int index = 0; index < numberOfCenters; ++index) { SequenceFile.Reader reader = new SequenceFile.Reader(fs, new Path(centersDirectory + "/centers/" + index), c); LongWritable key = new LongWritable(); Point value = new Point(); reader.next(key, value); Point center = (Point) value; centers.add(center); reader.close(); } } catch (IOException e) { // do nothing // I hope this doesn't happen System.out.println("well, damn."); e.printStackTrace(); } }
/** * @param path * @param job * @param fs * @param dest * @param rlen * @param clen * @param brlen * @param bclen * @throws IOException * @throws IllegalAccessException * @throws InstantiationException */ @SuppressWarnings("deprecation") private void readBinaryBlockMatrixBlocksFromHDFS( Path path, JobConf job, FileSystem fs, Collection<IndexedMatrixValue> dest, long rlen, long clen, int brlen, int bclen) throws IOException { MatrixIndexes key = new MatrixIndexes(); MatrixBlock value = new MatrixBlock(); // set up preferred custom serialization framework for binary block format if (MRJobConfiguration.USE_BINARYBLOCK_SERIALIZATION) MRJobConfiguration.addBinaryBlockSerializationFramework(job); for (Path lpath : getSequenceFilePaths(fs, path)) // 1..N files { // directly read from sequence files (individual partfiles) SequenceFile.Reader reader = new SequenceFile.Reader(fs, lpath, job); try { while (reader.next(key, value)) { int row_offset = (int) (key.getRowIndex() - 1) * brlen; int col_offset = (int) (key.getColumnIndex() - 1) * bclen; int rows = value.getNumRows(); int cols = value.getNumColumns(); // bound check per block if (row_offset + rows < 0 || row_offset + rows > rlen || col_offset + cols < 0 || col_offset + cols > clen) { throw new IOException( "Matrix block [" + (row_offset + 1) + ":" + (row_offset + rows) + "," + (col_offset + 1) + ":" + (col_offset + cols) + "] " + "out of overall matrix range [1:" + rlen + ",1:" + clen + "]."); } // copy block to result dest.add(new IndexedMatrixValue(new MatrixIndexes(key), new MatrixBlock(value))); } } finally { IOUtilFunctions.closeSilently(reader); } } }
/**
 * Loads the TF-IDF vectors from {@code reuters/tfidf-vectors/part-r-00000}, picks 25 random
 * points as initial clusters and runs in-memory k-means with a cosine distance measure,
 * printing the id and center of each cluster of the last iteration.
 *
 * @param args ignored
 * @throws Exception if the vectors cannot be read or clustering fails
 */
public static void main(String args[]) throws Exception {
  String inputDir = "reuters";
  int k = 25;

  Configuration conf = new Configuration();
  FileSystem fs = FileSystem.get(conf);
  String vectorsFolder = inputDir + "/tfidf-vectors";
  List<Vector> points = new ArrayList<Vector>();

  SequenceFile.Reader reader =
      new SequenceFile.Reader(fs, new Path(vectorsFolder + "/part-r-00000"), conf);
  try {
    Text key = new Text();
    VectorWritable value = new VectorWritable();
    while (reader.next(key, value)) {
      points.add(value.get());
    }
    System.out.println(points.size());
  } finally {
    // Close even when a read fails part-way through.
    reader.close();
  }

  List<Vector> randomPoints = RandomPointsUtil.chooseRandomPoints(points, k);
  List<Cluster> clusters = new ArrayList<Cluster>();
  System.out.println(randomPoints.size());

  int clusterId = 0;
  for (Vector v : randomPoints) {
    clusters.add(new Cluster(v, clusterId++, new CosineDistanceMeasure()));
  }

  List<List<Cluster>> finalClusters =
      KMeansClusterer.clusterPoints(points, clusters, new CosineDistanceMeasure(), 10, 0.01);
  // Only the clusters of the last iteration are reported.
  for (Cluster cluster : finalClusters.get(finalClusters.size() - 1)) {
    System.out.println(
        "Cluster id: " + cluster.getId() + " center: " + cluster.getCenter().asFormatString());
  }
}
/**
 * Applies preserved attributes to the destination entries listed in the destination
 * directory list.
 *
 * <p>Does nothing when {@code presevedAttributes} is {@code null} or when it selects none of
 * user, group or permission.
 *
 * @param conf configuration used to resolve the destination file system
 * @param jobconf job configuration naming the directory list and used to read it
 * @param destPath root under which each listed output path is resolved
 * @param presevedAttributes attribute specification parsed by {@code FileAttribute.parse}
 * @throws IOException if the list cannot be read or a status lookup fails
 */
private static void finalize(
    Configuration conf, JobConf jobconf, final Path destPath, String presevedAttributes)
    throws IOException {
  if (presevedAttributes == null) {
    return;
  }
  EnumSet<FileAttribute> preseved = FileAttribute.parse(presevedAttributes);
  boolean anythingToApply =
      preseved.contains(FileAttribute.USER)
          || preseved.contains(FileAttribute.GROUP)
          || preseved.contains(FileAttribute.PERMISSION);
  if (!anythingToApply) {
    return;
  }

  FileSystem dstfs = destPath.getFileSystem(conf);
  Path dstdirlist = new Path(jobconf.get(DST_DIR_LIST_LABEL));
  SequenceFile.Reader in = null;
  try {
    in = new SequenceFile.Reader(dstdirlist.getFileSystem(jobconf), dstdirlist, jobconf);
    Text listedKey = new Text();
    FilePair entry = new FilePair();
    while (in.next(listedKey, entry)) {
      Path absdst = new Path(destPath, entry.output);
      updatePermissions(entry.input, dstfs.getFileStatus(absdst), preseved, dstfs);
    }
  } finally {
    checkAndClose(in);
  }
}
/**
 * Reads the values of a SequenceFile, up to a maximum number, discarding the keys.
 *
 * <p>A fresh value object is instantiated per record because the reader fills the object it
 * is handed in place; the key object is reused since keys are not returned.
 *
 * @param path path to file
 * @param fs file system holding the file; its configuration is used to open the reader
 * @param max maximum number of values to read; values {@code <= 0} yield an empty list
 * @return the values in file order
 * @throws RuntimeException wrapping any failure to open, read or instantiate
 */
@SuppressWarnings("unchecked")
public static <V extends Writable> List<V> readValues(Path path, FileSystem fs, int max) {
  List<V> list = new ArrayList<V>();

  try {
    SequenceFile.Reader reader = new SequenceFile.Reader(fs, path, fs.getConf());
    try {
      Writable key = (Writable) reader.getKeyClass().newInstance();
      // Checking the limit before reading keeps max <= 0 from returning one value.
      while (list.size() < max) {
        V value = (V) reader.getValueClass().newInstance();
        if (!reader.next(key, value)) {
          break;
        }
        list.add(value);
      }
    } finally {
      // Close on every path; a failure here is wrapped by the outer catch.
      reader.close();
    }
  } catch (Exception e) {
    // Keep the cause so the underlying failure is not lost.
    throw new RuntimeException("Error reading SequenceFile " + path, e);
  }

  return list;
}
public static Canopies readCanopyCenters(Configuration conf) throws IOException { Canopies canopies = new Canopies(); FileSystem fs = FileSystem.get(conf); Path canopyFileName = new Path(Nasdaq.CANOPY_SEQ_FILE_PATH); // init canopies @SuppressWarnings("deprecation") SequenceFile.Reader reader = new SequenceFile.Reader(fs, canopyFileName, conf); StockVector canopy = new StockVector(); IntWritable value = new IntWritable(); while (reader.next(canopy, value)) { // parse the canopy center StockVector canopyToAdd = new StockVector(canopy); // add to canopy centers canopies.addCanopy(canopyToAdd); } reader.close(); // fs.close(); return canopies; }
/**
 * Writes one {@code docId<TAB>key} line for every record of every data file under
 * {@code input}.
 *
 * <p>Output goes to {@code outputFile} encoded as UTF-8, or to standard output (platform
 * default encoding) when {@code outputFile} is {@code null}. Standard output is flushed but
 * not closed, so the caller can keep printing afterwards.
 *
 * @param conf configuration used to resolve the file system and open each file
 * @param input directory (or file) whose entries accepted by {@code DataPathFilter} are read
 * @param outputFile local file to write to, or {@code null} for standard output
 * @throws IOException if reading or writing fails
 * @throws InstantiationException if the key class cannot be instantiated
 * @throws IllegalAccessException if the key class constructor is not accessible
 */
public static void run(Configuration conf, Path input, String outputFile)
    throws IOException, InstantiationException, IllegalAccessException {
  final boolean toStdout = outputFile == null;
  Writer writer;
  if (toStdout) {
    writer = new OutputStreamWriter(System.out);
  } else {
    writer =
        new OutputStreamWriter(
            new FileOutputStream(new File(outputFile)), Charset.forName("UTF-8"));
  }

  try {
    FileSystem fs = input.getFileSystem(conf);
    for (FileStatus fst : fs.listStatus(input, new DataPathFilter())) {
      Path dataPath = fst.getPath();
      SequenceFile.Reader reader = new SequenceFile.Reader(fs, dataPath, conf);
      try {
        Text key = reader.getKeyClass().asSubclass(Text.class).newInstance();
        DocumentMapping value = new DocumentMapping();
        while (reader.next(key, value)) {
          String docId = value.getDocId();
          writer.write(docId + "\t" + key + "\n");
        }
      } finally {
        reader.close();
      }
    }
  } finally {
    if (toStdout) {
      // Closing the wrapper would close System.out for the whole JVM.
      writer.flush();
    } else {
      writer.close();
    }
  }
}
/**
 * Debugging aid (TODO remove): logs every key/value pair of each part file in
 * {@code outDir}.
 *
 * @param jobName not used by this method
 * @param outDir job output directory to scan
 * @throws IOException if {@code outDir} is not a directory or a file cannot be read
 */
private void readOutputFiles(String jobName, Path outDir) throws IOException {
  FileSystem fs = outDir.getFileSystem(getConf());
  if (!fs.getFileStatus(outDir).isDir()) {
    throw new IOException(outDir.toString() + " is not a directory");
  }

  FileStatus[] files = fs.listStatus(outDir);
  for (FileStatus f : files) {
    Path fPath = f.getPath();
    if ((!f.isDir()) && (fPath.getName().startsWith(PART_PREFIX))) {
      LOG.info("opening " + fPath.toString());
      SequenceFile.Reader reader = new SequenceFile.Reader(fs, fPath, getConf());
      try {
        Text key = new Text();
        Text value = new Text();
        while (reader.next(key, value)) {
          LOG.info("read " + f.getPath().toString());
          LOG.info("read: k=" + key.toString() + " v=" + value.toString());
        }
        LOG.info("done reading " + fPath.toString());
      } finally {
        // Close even when a read fails, so the stream is not leaked.
        reader.close();
      }
    }
  }
}
/**
 * Determines which files have failed for a given job by collecting the keys of every part
 * file in the job's output directory.
 *
 * @param job job whose SequenceFile output directory is inspected
 * @return the distinct keys found in the part files
 * @throws IOException if the output path is not a directory or a file cannot be read
 */
private Set<String> getFailedFiles(Job job) throws IOException {
  Set<String> failedFiles = new HashSet<String>();

  Path outDir = SequenceFileOutputFormat.getOutputPath(job);
  FileSystem fs = outDir.getFileSystem(getConf());
  if (!fs.getFileStatus(outDir).isDir()) {
    throw new IOException(outDir.toString() + " is not a directory");
  }

  FileStatus[] files = fs.listStatus(outDir);
  for (FileStatus f : files) {
    Path fPath = f.getPath();
    if ((!f.isDir()) && (fPath.getName().startsWith(PART_PREFIX))) {
      LOG.info("opening " + fPath.toString());
      SequenceFile.Reader reader = new SequenceFile.Reader(fs, fPath, getConf());
      try {
        Text key = new Text();
        Text value = new Text();
        while (reader.next(key, value)) {
          failedFiles.add(key.toString());
        }
      } finally {
        // Close even when a read fails, so the stream is not leaked.
        reader.close();
      }
    }
  }
  return failedFiles;
}
/**
 * Issues {@code count} read calls against the SequenceFile at {@code file}, discarding the
 * records.
 *
 * <p>The return value of each read is ignored, so the loop always makes exactly
 * {@code count} calls even if the file holds fewer records.
 *
 * @param fs file system holding the file
 * @param count number of {@code next} calls to make
 * @param file SequenceFile to read
 * @throws IOException if the file cannot be opened or read
 */
public void performSequenceFileRead(FileSystem fs, int count, Path file) throws IOException {
  SequenceFile.Reader reader = new SequenceFile.Reader(fs, file, conf);
  try {
    ByteWritable key = new ByteWritable();
    BytesRefArrayWritable val = new BytesRefArrayWritable();
    for (int i = 0; i < count; i++) {
      reader.next(key, val);
    }
  } finally {
    // The reader was previously never closed, leaking one open stream per call.
    reader.close();
  }
}
private List<InputSplit> getSplits( Configuration configuration, int numSplits, long totalSizeBytes) throws IOException { List<InputSplit> splits = new ArrayList<InputSplit>(numSplits); long nBytesPerSplit = (long) Math.ceil(totalSizeBytes * 1.0 / numSplits); CopyListingFileStatus srcFileStatus = new CopyListingFileStatus(); Text srcRelPath = new Text(); long currentSplitSize = 0; long lastSplitStart = 0; long lastPosition = 0; final Path listingFilePath = getListingFilePath(configuration); if (LOG.isDebugEnabled()) { LOG.debug( "Average bytes per map: " + nBytesPerSplit + ", Number of maps: " + numSplits + ", total size: " + totalSizeBytes); } SequenceFile.Reader reader = null; try { reader = getListingFileReader(configuration); while (reader.next(srcRelPath, srcFileStatus)) { // If adding the current file would cause the bytes per map to exceed // limit. Add the current file to new split if (currentSplitSize + srcFileStatus.getLen() > nBytesPerSplit && lastPosition != 0) { FileSplit split = new FileSplit(listingFilePath, lastSplitStart, lastPosition - lastSplitStart, null); if (LOG.isDebugEnabled()) { LOG.debug("Creating split : " + split + ", bytes in split: " + currentSplitSize); } splits.add(split); lastSplitStart = lastPosition; currentSplitSize = 0; } currentSplitSize += srcFileStatus.getLen(); lastPosition = reader.getPosition(); } if (lastPosition > lastSplitStart) { FileSplit split = new FileSplit(listingFilePath, lastSplitStart, lastPosition - lastSplitStart, null); if (LOG.isDebugEnabled()) { LOG.debug("Creating split : " + split + ", bytes in split: " + currentSplitSize); } splits.add(split); } } finally { IOUtils.closeStream(reader); } return splits; }
/**
 * Reads the first record of the SequenceFile at {@code path} into a new
 * {@code ClusterClassifier} and returns it. The record key is read into a {@code Text} and
 * discarded.
 *
 * @param config configuration used to open the file
 * @param path location of the serialized classifier
 * @param fs file system holding the file
 * @return the classifier populated from the first record
 * @throws IOException if the file cannot be opened or read
 */
private static ClusterClassifier readClassifier(Configuration config, Path path, FileSystem fs)
    throws IOException {
  SequenceFile.Reader in = new SequenceFile.Reader(fs, path, config);
  Writable ignoredKey = new Text();
  ClusterClassifier classifier = new ClusterClassifier();
  try {
    in.next(ignoredKey, classifier);
    return classifier;
  } finally {
    Closeables.closeQuietly(in);
  }
}
private static List<IDistanceDensityMul> getIDistanceDensityMulList(String url) throws FileNotFoundException, IOException { Configuration conf = HUtils.getConf(); SequenceFile.Reader reader = null; // 多个文件整合,需排序 List<IDistanceDensityMul> allList = new ArrayList<IDistanceDensityMul>(); // 单个文件 List<IDistanceDensityMul> fileList = new ArrayList<IDistanceDensityMul>(); FileStatus[] fss = HUtils.getHDFSPath(url, "true") .getFileSystem(conf) .listStatus(HUtils.getHDFSPath(url, "true")); for (FileStatus f : fss) { if (!f.toString().contains("part")) { continue; // 排除其他文件 } try { reader = new SequenceFile.Reader( conf, Reader.file(f.getPath()), Reader.bufferSize(4096), Reader.start(0)); // <density_i*min_distancd_j> <first:density_i,second:min_distance_j,third:i> // DoubleWritable, IntDoublePairWritable CustomDoubleWritable dkey = (CustomDoubleWritable) ReflectionUtils.newInstance(reader.getKeyClass(), conf); IntDoublePairWritable dvalue = (IntDoublePairWritable) ReflectionUtils.newInstance(reader.getValueClass(), conf); int i = Utils.GETDRAWPICRECORDS_EVERYFILE; while (reader.next(dkey, dvalue) && i > 0) { // 循环读取文件 i--; fileList.add( new IDistanceDensityMul( dvalue.getSecond(), dvalue.getFirst(), dvalue.getThird(), dkey.get())); // 每个文件都是从小到大排序的 } } catch (Exception e) { e.printStackTrace(); } finally { IOUtils.closeStream(reader); } // 整合当前文件的前面若干条记录(Utils.GETDRAWPICRECORDS_EVERYFILE ) if (allList.size() <= 0) { // 第一次可以全部添加 allList.addAll(fileList); } else { combineLists(allList, fileList); } } // for // 第一个点太大了,选择去掉 return allList.subList(1, allList.size()); }
/**
 * Loads the whole copy listing at {@code listingPath} into a map from relative path to file
 * status.
 *
 * <p>A fresh key and value object is created after every record because the reader fills
 * the objects it is handed in place and the map retains them.
 *
 * @param listingPath location of the listing SequenceFile
 * @return all entries of the listing; later duplicates of a key replace earlier ones
 * @throws Exception if the listing cannot be opened or read
 */
private Map<Text, CopyListingFileStatus> getListing(Path listingPath) throws Exception {
  Map<Text, CopyListingFileStatus> values = new HashMap<>();
  // try-with-resources: the reader was previously never closed.
  try (SequenceFile.Reader reader =
      new SequenceFile.Reader(conf, SequenceFile.Reader.file(listingPath))) {
    Text key = new Text();
    CopyListingFileStatus value = new CopyListingFileStatus();
    while (reader.next(key, value)) {
      values.put(key, value);
      key = new Text();
      value = new CopyListingFileStatus();
    }
  }
  return values;
}
@Override protected boolean doProcess(Record inputRecord, final InputStream in) throws IOException { SequenceFile.Metadata sequenceFileMetaData = null; SequenceFile.Reader reader = null; try { reader = new SequenceFile.Reader( conf, SequenceFile.Reader.stream(new FSDataInputStream(new ForwardOnlySeekable(in)))); if (includeMetaData) { sequenceFileMetaData = reader.getMetadata(); } Class keyClass = reader.getKeyClass(); Class valueClass = reader.getValueClass(); Record template = inputRecord.copy(); removeAttachments(template); while (true) { Writable key = (Writable) ReflectionUtils.newInstance(keyClass, conf); Writable val = (Writable) ReflectionUtils.newInstance(valueClass, conf); try { if (!reader.next(key, val)) { break; } } catch (EOFException ex) { // SequenceFile.Reader will throw an EOFException after reading // all the data, if it doesn't know the length. Since we are // passing in an InputStream, we hit this case; LOG.trace("Received expected EOFException", ex); break; } incrementNumRecords(); Record outputRecord = template.copy(); outputRecord.put(keyField, key); outputRecord.put(valueField, val); outputRecord.put(Fields.ATTACHMENT_MIME_TYPE, OUTPUT_MEDIA_TYPE); if (includeMetaData && sequenceFileMetaData != null) { outputRecord.put(SEQUENCE_FILE_META_DATA, sequenceFileMetaData); } // pass record to next command in chain: if (!getChild().process(outputRecord)) { return false; } } } finally { Closeables.closeQuietly(reader); } return true; }
private static void loadPairs( HMapIV<ArrayListOfIntsWritable> pwsimMapping, int langID, JobConf job, Reporter reporter) { try { Path[] localFiles = DistributedCache.getLocalCacheFiles(job); String pwsimFile = job.get("PwsimPairs"); for (Path localFile : localFiles) { if (localFile.toString().contains(getFilename(pwsimFile))) { SequenceFile.Reader reader = new SequenceFile.Reader(FileSystem.getLocal(job), localFile, job); PairOfInts key = (PairOfInts) reader.getKeyClass().newInstance(); IntWritable value = (IntWritable) reader.getValueClass().newInstance(); int cnt = 0; while (reader.next(key, value)) { int fDocno = key.getRightElement(); int eDocno = key.getLeftElement(); if ((eDocno == 6127 && fDocno == 1000000074) || (eDocno == 6127 && fDocno == 1000000071)) { sLogger.info(key); } if (langID == CLIRUtils.E) { if (!pwsimMapping.containsKey(eDocno)) { pwsimMapping.put(eDocno, new ArrayListOfIntsWritable()); } pwsimMapping .get(eDocno) .add( fDocno); // we add 1000000000 to foreign docnos to distinguish them during // pwsim algo } else { if (!pwsimMapping.containsKey(fDocno)) { pwsimMapping.put(fDocno, new ArrayListOfIntsWritable()); } pwsimMapping .get(fDocno) .add( eDocno); // we add 1000000000 to foreign docnos to distinguish them during // pwsim algo } cnt++; key = (PairOfInts) reader.getKeyClass().newInstance(); value = (IntWritable) reader.getValueClass().newInstance(); } reader.close(); sLogger.info(pwsimMapping.size() + "," + cnt + " pairs loaded from " + localFile); } } } catch (Exception e) { throw new RuntimeException(e); } }
/**
 * Looks up the vector stored under {@code keyIndex} in the SequenceFile at {@code p}.
 *
 * @param p SequenceFile of (int key, vector) records
 * @param keyIndex key to search for
 * @return the vector of the first record whose key equals {@code keyIndex}, or {@code null}
 *     if the file does not exist or holds no such key
 * @throws IOException if the file cannot be opened or read
 */
protected Vector fetchVector(Path p, int keyIndex) throws IOException {
  if (!fs.exists(p)) {
    return null;
  }
  SequenceFile.Reader reader = new SequenceFile.Reader(fs, p, conf);
  try {
    IntWritable key = new IntWritable();
    VectorWritable vw = new VectorWritable();
    while (reader.next(key, vw)) {
      if (key.get() == keyIndex) {
        return vw.get();
      }
    }
    return null;
  } finally {
    // The reader was previously never closed, on either return path.
    reader.close();
  }
}
/** * Produce splits such that each is no greater than the quotient of the total size and the * number of splits requested. * * @param job The handle to the JobConf object * @param numSplits Number of splits requested */ public InputSplit[] getSplits(JobConf job, int numSplits) throws IOException { int cnfiles = job.getInt(SRC_COUNT_LABEL, -1); long cbsize = job.getLong(TOTAL_SIZE_LABEL, -1); String srcfilelist = job.get(SRC_LIST_LABEL, ""); if (cnfiles < 0 || cbsize < 0 || "".equals(srcfilelist)) { throw new RuntimeException( "Invalid metadata: #files(" + cnfiles + ") total_size(" + cbsize + ") listuri(" + srcfilelist + ")"); } Path src = new Path(srcfilelist); FileSystem fs = src.getFileSystem(job); FileStatus srcst = fs.getFileStatus(src); ArrayList<FileSplit> splits = new ArrayList<FileSplit>(numSplits); LongWritable key = new LongWritable(); FilePair value = new FilePair(); final long targetsize = cbsize / numSplits; long pos = 0L; long last = 0L; long acc = 0L; long cbrem = srcst.getLen(); SequenceFile.Reader sl = null; try { sl = new SequenceFile.Reader(fs, src, job); for (; sl.next(key, value); last = sl.getPosition()) { // if adding this split would put this split past the target size, // cut the last split and put this next file in the next split. if (acc + key.get() > targetsize && acc != 0) { long splitsize = last - pos; splits.add(new FileSplit(src, pos, splitsize, (String[]) null)); cbrem -= splitsize; pos = last; acc = 0L; } acc += key.get(); } } finally { checkAndClose(sl); } if (cbrem != 0) { splits.add(new FileSplit(src, pos, cbrem, (String[]) null)); } return splits.toArray(new FileSplit[splits.size()]); }
/**
 * Counts the records in the SequenceFile behind {@code logFilePath}.
 *
 * @param logFilePath descriptor whose {@code getLogFilePath()} names the file
 * @return number of records in the file
 * @throws Exception if the file cannot be opened or read, or its key/value class cannot be
 *     instantiated
 */
private int getMessageCount(LogFilePath logFilePath) throws Exception {
  String path = logFilePath.getLogFilePath();
  Path fsPath = new Path(path);
  FileSystem fileSystem = FileUtil.getFileSystem(path);
  SequenceFile.Reader reader = new SequenceFile.Reader(fileSystem, fsPath, new Configuration());
  try {
    LongWritable key = (LongWritable) reader.getKeyClass().newInstance();
    BytesWritable value = (BytesWritable) reader.getValueClass().newInstance();
    int result = 0;
    while (reader.next(key, value)) {
      result++;
    }
    return result;
  } finally {
    // Close even when instantiation or a read fails.
    reader.close();
  }
}
/**
 * Adds the key of every record in the SequenceFile behind {@code logFilePath} to
 * {@code offsets}, failing on the first key that is already present.
 *
 * @param logFilePath descriptor whose {@code getLogFilePath()} names the file
 * @param offsets set that accumulates the keys; may already contain keys of other files
 * @throws RuntimeException if a key is already contained in {@code offsets}
 * @throws Exception if the file cannot be opened or read, or its key/value class cannot be
 *     instantiated
 */
private void getOffsets(LogFilePath logFilePath, Set<Long> offsets) throws Exception {
  String path = logFilePath.getLogFilePath();
  Path fsPath = new Path(path);
  FileSystem fileSystem = FileUtil.getFileSystem(path);
  SequenceFile.Reader reader = new SequenceFile.Reader(fileSystem, fsPath, new Configuration());
  try {
    LongWritable key = (LongWritable) reader.getKeyClass().newInstance();
    BytesWritable value = (BytesWritable) reader.getValueClass().newInstance();
    while (reader.next(key, value)) {
      if (!offsets.add(key.get())) {
        throw new RuntimeException(
            "duplicate key " + key.get() + " found in file " + logFilePath.getLogFilePath());
      }
    }
  } finally {
    // Also reached when the duplicate-key exception is thrown, which used to leak the reader.
    reader.close();
  }
}
/**
 * Returns the vector held by the first record of the SequenceFile at {@code path}.
 *
 * @param conf configuration used to resolve the file system and open the file
 * @param path location of the vector file
 * @return the vector of the first record
 * @throws IOException if the file is empty or cannot be opened or read
 */
private static Vector loadVector(Configuration conf, Path path) throws IOException {
  SequenceFile.Reader in = new SequenceFile.Reader(path.getFileSystem(conf), path, conf);
  Writable ignoredKey = new IntWritable();
  VectorWritable holder = new VectorWritable();
  try {
    boolean hasRecord = in.next(ignoredKey, holder);
    if (hasRecord) {
      return holder.get();
    }
    throw new IOException("Input vector file is empty.");
  } finally {
    in.close();
  }
}
/**
 * Loads the cluster centers from the SequenceFile named by the {@code CENTERS_CONF_KEY}
 * setting into {@code centers}.
 *
 * <p>Each record key is copied into a new {@code Centroid} because the reader reuses the
 * key object between records. Record values are read but not used.
 *
 * @param context task context supplying the job configuration
 * @throws IOException if the centers file cannot be opened or read
 * @throws InterruptedException propagated from {@code super.setup}
 */
@Override
protected void setup(Context context) throws IOException, InterruptedException {
  super.setup(context);
  Configuration conf = context.getConfiguration();
  Path centroids = new Path(conf.get(CENTERS_CONF_KEY));
  // Resolve the file system from the path so a fully qualified URI on a non-default
  // file system also works; an unqualified path still resolves to the default one.
  FileSystem fs = centroids.getFileSystem(conf);

  SequenceFile.Reader reader = new SequenceFile.Reader(fs, centroids, conf);
  try {
    Centroid key = new Centroid();
    IntWritable value = new IntWritable();
    while (reader.next(key, value)) {
      centers.add(new Centroid(key));
    }
  } finally {
    // Close even when a read fails, so the stream is not leaked.
    reader.close();
  }
}