@SuppressWarnings("deprecation") private void createControlFile( FileSystem fs, long nrBytes, // in bytes int nrFiles) throws IOException { LOG.info("creating control file: " + nrBytes + " bytes, " + nrFiles + " files"); Path controlDir = getControlDir(config); fs.delete(controlDir, true); for (int i = 0; i < nrFiles; i++) { String name = getFileName(i); Path controlFile = new Path(controlDir, "in_file_" + name); SequenceFile.Writer writer = null; try { writer = SequenceFile.createWriter( fs, config, controlFile, Text.class, LongWritable.class, CompressionType.NONE); writer.append(new Text(name), new LongWritable(nrBytes)); } catch (Exception e) { throw new IOException(e.getLocalizedMessage()); } finally { if (writer != null) writer.close(); writer = null; } } LOG.info("created control files for: " + nrFiles + " files"); }
public static void createCentersSequenceFile(
    Configuration conf, FileSystem fs, String centroidsPath, String sequenceFilePath)
    throws Exception {
  Path seqFile = new Path(sequenceFilePath);
  if (fs.exists(seqFile)) {
    fs.delete(seqFile, true);
  }
  FSDataInputStream inputStream = fs.open(new Path(centroidsPath));
  SequenceFile.Writer writer =
      SequenceFile.createWriter(fs, conf, seqFile, Centroid.class, IntWritable.class);
  IntWritable value = new IntWritable(0);
  while (inputStream.available() > 0) {
    String line = inputStream.readLine();
    StringTokenizer tokenizer = new StringTokenizer(line, " ");
    int dim = tokenizer.countTokens() - 1;
    int clusterId = Integer.valueOf(tokenizer.nextToken());
    double[] coords = new double[dim];
    for (int i = 0; i < dim; i++) {
      coords[i] = Double.valueOf(tokenizer.nextToken());
    }
    Centroid cluster = new Centroid(clusterId, new Point(coords));
    writer.append(cluster, value);
  }
  IOUtils.closeStream(writer);
  inputStream.close();
}
private static void createControlFile(
    FileSystem fs,
    int fileSize, // in MB
    int nrFiles)
    throws IOException {
  LOG.info("creating control file: " + fileSize + " mega bytes, " + nrFiles + " files");
  fs.delete(CONTROL_DIR, true);

  for (int i = 0; i < nrFiles; i++) {
    String name = getFileName(i);
    Path controlFile = new Path(CONTROL_DIR, "in_file_" + name);
    SequenceFile.Writer writer = null;
    try {
      writer =
          SequenceFile.createWriter(
              fs, fsConfig, controlFile, Text.class, LongWritable.class, CompressionType.NONE);
      writer.append(new Text(name), new LongWritable(fileSize));
    } catch (Exception e) {
      throw new IOException(e.getLocalizedMessage());
    } finally {
      if (writer != null) writer.close();
      writer = null;
    }
  }
  LOG.info("created control files for: " + nrFiles + " files");
}
@Override
public boolean write(String uri, InputStream stream) {
  SequenceFile.Writer writer = getWriterFor(uri);
  try {
    int size = stream.available();
    byte[] bytes = new byte[size];
    int readBytes = stream.read(bytes);
    if (readBytes != size) {
      log.error(
          "Could not read all the bytes from the inputStream. Read "
              + readBytes
              + " instead of "
              + size);
      return false;
    }
    HDFSByteChunk byteChunk = new HDFSByteChunk(bytes, uri);
    writer.append(new IntWritable(0), byteChunk);
    writer.close();
    return true;
  } catch (IOException e) {
    e.printStackTrace();
    return false;
  }
}
public static void main(String args[]) throws Exception {
  if (args.length != 2) {
    System.err.println("arguments: input-dir output-file");
    System.exit(1);
  }

  FileSystem fs = FileSystem.get(confHadoop);
  Path inPath = new Path(args[0]);
  Path outPath = new Path(args[1] + "/dataset");
  FSDataInputStream in = null;
  SequenceFile.Writer writer = null;
  List<Path> files = listFiles(inPath, jpegFilter);
  try {
    writer = SequenceFile.createWriter(fs, confHadoop, outPath, Text.class, BytesWritable.class);
    for (Path p : files) {
      in = fs.open(p);
      byte buffer[] = new byte[in.available()];
      in.readFully(buffer);
      writer.append(new Text(p.getName()), new BytesWritable(buffer));
      in.close();
    }
  } finally {
    IOUtils.closeStream(writer);
  }
}
private void writeSeqenceFileTest(
    FileSystem fs, int rowCount, Path file, int columnNum, CompressionCodec codec)
    throws IOException {

  byte[][] columnRandom;

  resetRandomGenerators();

  BytesRefArrayWritable bytes = new BytesRefArrayWritable(columnNum);
  columnRandom = new byte[columnNum][];
  for (int i = 0; i < columnNum; i++) {
    BytesRefWritable cu = new BytesRefWritable();
    bytes.set(i, cu);
  }

  // zero length key is not allowed by block compress writer, so we use a byte
  // writable
  ByteWritable key = new ByteWritable();
  SequenceFile.Writer seqWriter =
      SequenceFile.createWriter(
          fs,
          conf,
          file,
          ByteWritable.class,
          BytesRefArrayWritable.class,
          CompressionType.BLOCK,
          codec);

  for (int i = 0; i < rowCount; i++) {
    nextRandomRow(columnRandom, bytes);
    seqWriter.append(key, bytes);
  }
  seqWriter.close();
}
/**
 * set up input file which has the list of input files.
 *
 * @return boolean
 * @throws IOException
 */
private boolean setup() throws IOException {
  estimateSavings();

  final String randomId = getRandomId();
  JobClient jClient = new JobClient(jobconf);
  Path jobdir = new Path(jClient.getSystemDir(), NAME + "_" + randomId);

  LOG.info(JOB_DIR_LABEL + "=" + jobdir);
  jobconf.set(JOB_DIR_LABEL, jobdir.toString());
  Path log = new Path(jobdir, "_logs");

  // The control file should have small size blocks. This helps
  // in spreading out the load from mappers that will be spawned.
  jobconf.setInt("dfs.blocks.size", OP_LIST_BLOCK_SIZE);

  FileOutputFormat.setOutputPath(jobconf, log);
  LOG.info("log=" + log);

  // create operation list
  FileSystem fs = jobdir.getFileSystem(jobconf);
  Path opList = new Path(jobdir, "_" + OP_LIST_LABEL);
  jobconf.set(OP_LIST_LABEL, opList.toString());
  int opCount = 0, synCount = 0;
  SequenceFile.Writer opWriter = null;
  try {
    opWriter =
        SequenceFile.createWriter(
            fs, jobconf, opList, Text.class, PolicyInfo.class, SequenceFile.CompressionType.NONE);
    for (RaidPolicyPathPair p : raidPolicyPathPairList) {
      // If a large set of files are Raided for the first time, files
      // in the same directory that tend to have the same size will end up
      // with the same map. This shuffle mixes things up, allowing a better
      // mix of files.
      java.util.Collections.shuffle(p.srcPaths);
      for (FileStatus st : p.srcPaths) {
        opWriter.append(new Text(st.getPath().toString()), p.policy);
        opCount++;
        if (++synCount > SYNC_FILE_MAX) {
          opWriter.sync();
          synCount = 0;
        }
      }
    }
  } finally {
    if (opWriter != null) {
      opWriter.close();
    }
    fs.setReplication(opList, OP_LIST_REPLICATION); // increase replication for control file
  }
  raidPolicyPathPairList.clear();

  jobconf.setInt(OP_COUNT_LABEL, opCount);
  LOG.info("Number of files=" + opCount);
  jobconf.setNumMapTasks(
      getMapCount(opCount, new JobClient(jobconf).getClusterStatus().getTaskTrackers()));
  LOG.info("jobName= " + jobName + " numMapTasks=" + jobconf.getNumMapTasks());
  return opCount != 0;
}
private void close() throws IOException {
  for (SequenceFile.Writer writer : writers.values()) {
    writer.close();
  }
  writers.clear();
  LOG.info("closed writer");
}
public static void main(String[] args) throws IOException {
  String uri = args[0];
  Configuration conf = new Configuration();
  Path path = new Path(uri);
  FileSystem fs = FileSystem.get(URI.create(uri), conf);

  IntWritable key = new IntWritable();
  Text value = new Text();
  SequenceFile.Writer writer = null;
  try {
    writer = SequenceFile.createWriter(fs, conf, path, key.getClass(), value.getClass());
    for (int i = 0; i < 100; i++) {
      key.set(100 - i);
      value.set(DATA[i % DATA.length]);
      writer.append(key, value);
    }
  } finally {
    IOUtils.closeStream(writer);
  }
}
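To check what the loop above produced, a short read-back sketch using SequenceFile.Reader may help. It reuses fs, path, and conf from the snippet and uses the same legacy constructor style as the writer call; it is an illustration, not part of the original example:

// Sketch only: iterate the key/value pairs written above and print them.
SequenceFile.Reader reader = null;
try {
  reader = new SequenceFile.Reader(fs, path, conf);
  IntWritable k = new IntWritable();
  Text v = new Text();
  while (reader.next(k, v)) {
    System.out.println(k + "\t" + v);
  }
} finally {
  IOUtils.closeStream(reader);
}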
private void writeToFileListing(
    SequenceFile.Writer fileListWriter,
    FileStatus fileStatus,
    Path sourcePathRoot,
    boolean localFile)
    throws IOException {
  if (fileStatus.getPath().equals(sourcePathRoot) && fileStatus.isDir())
    return; // Skip the root-paths.

  if (LOG.isDebugEnabled()) {
    LOG.debug(
        "REL PATH: "
            + DistCpUtils.getRelativePath(sourcePathRoot, fileStatus.getPath())
            + ", FULL PATH: "
            + fileStatus.getPath());
  }

  FileStatus status = fileStatus;
  if (localFile) {
    status = getFileStatus(fileStatus);
  }

  fileListWriter.append(
      new Text(DistCpUtils.getRelativePath(sourcePathRoot, fileStatus.getPath())), status);
  fileListWriter.sync();

  if (!fileStatus.isDir()) {
    totalBytesToCopy += fileStatus.getLen();
  }
  totalPaths++;
}
public static void createControlFile(FileSystem fs, long megaBytes, int numFiles, long seed)
    throws Exception {
  LOG.info("creating control file: " + megaBytes + " bytes, " + numFiles + " files");

  Path controlFile = new Path(CONTROL_DIR, "files");
  fs.delete(controlFile, true);

  Random random = new Random(seed);

  SequenceFile.Writer writer =
      SequenceFile.createWriter(
          fs, conf, controlFile, Text.class, LongWritable.class, CompressionType.NONE);

  long totalSize = 0;
  long maxSize = ((megaBytes / numFiles) * 2) + 1;
  try {
    while (totalSize < megaBytes) {
      Text name = new Text(Long.toString(random.nextLong()));

      long size = random.nextLong();
      if (size < 0) size = -size;
      size = size % maxSize;

      // LOG.info(" adding: name="+name+" size="+size);

      writer.append(name, new LongWritable(size));

      totalSize += size;
    }
  } finally {
    writer.close();
  }
  LOG.info("created control file for: " + totalSize + " bytes");
}
/** return a mapping of expected keys -> records */
private HashMap<String, Record> createTextSequenceFile(File file, int numRecords)
    throws IOException {
  HashMap<String, Record> map = new HashMap<String, Record>();

  SequenceFile.Metadata metadata = new SequenceFile.Metadata(getMetadataForSequenceFile());

  FSDataOutputStream out = new FSDataOutputStream(new FileOutputStream(file), null);
  SequenceFile.Writer writer = null;
  try {
    writer =
        SequenceFile.createWriter(
            new Configuration(),
            out,
            Text.class,
            Text.class,
            SequenceFile.CompressionType.NONE,
            null,
            metadata);

    for (int i = 0; i < numRecords; ++i) {
      Text key = new Text("key" + i);
      Text value = new Text("value" + i);
      writer.append(key, value);

      Record record = new Record();
      record.put("key", key);
      record.put("value", value);
      map.put(key.toString(), record);
    }
  } finally {
    Closeables.closeQuietly(writer);
  }

  return map;
}
@SuppressWarnings("deprecation") @Override protected void cleanup(Context context) throws IOException, InterruptedException { super.cleanup(context); List<Cluster> newKMeansClusters = new ArrayList<Cluster>(); List<Cluster> newCanopyClusters = new ArrayList<Cluster>(); for (Cluster kMeansCluster : _clusters.keySet()) { Cluster canopyCluster = _kMeansToCanopyMap.get(kMeansCluster); // Set a new Cluster center Vector center = new Vector(); center.setElements(new double[kMeansCluster.getCenterVector().getElements().length]); List<Vector> vectors = new ArrayList<Vector>(); for (Vector currentVector : _clusters.get(kMeansCluster)) { vectors.add(new Vector(currentVector)); // Sums the vectors to a new vector in order to find the one that is the closest to all // others, it will be our new cluster center. for (int i = 0; i < currentVector.getElements().length; i++) center.getElements()[i] += currentVector.getElements()[i]; } // Divides the vector's elements in order to find its real location (it will be a fictive // vector) for (int i = 0; i < center.getElements().length; i++) center.getElements()[i] = center.getElements()[i] / vectors.size(); Cluster newKMeansCluster = new Cluster(center); canopyCluster.setIsCovered(newKMeansCluster.isConvergedWithOtherCluster(kMeansCluster)); newKMeansClusters.add(newKMeansCluster); newCanopyClusters.add(canopyCluster); // Adding the vectors to the new cluster center for (Vector vector : vectors) { context.write(newKMeansCluster, vector); } } Configuration conf = context.getConfiguration(); Path outPath = new Path(conf.get("centers.path")); FileSystem fs = FileSystem.get(conf); if (fs.exists(outPath)) fs.delete(outPath, true); SequenceFile.Writer writer = SequenceFile.createWriter( fs, context.getConfiguration(), outPath, Cluster.class, Cluster.class); context.getCounter(Counter.CONVERGED).setValue(0); for (int i = 0; i < newKMeansClusters.size(); i++) { writer.append(newCanopyClusters.get(i), newKMeansClusters.get(i)); if (newCanopyClusters.get(i).getIsCovered()) context.getCounter(Counter.CONVERGED).increment(1); } writer.close(); }
private static <T extends WritableComparable> Path writePartitionFile(
    String testname, JobConf conf, T[] splits) throws IOException {
  final FileSystem fs = FileSystem.getLocal(conf);
  final Path testdir =
      new Path(System.getProperty("test.build.data", "/tmp")).makeQualified(fs);
  Path p = new Path(testdir, testname + "/_partition.lst");
  TotalOrderPartitioner.setPartitionFile(conf, p);
  conf.setNumReduceTasks(splits.length + 1);

  SequenceFile.Writer w = null;
  try {
    NullWritable nw = NullWritable.get();
    w =
        SequenceFile.createWriter(
            fs,
            conf,
            p,
            splits[0].getClass(),
            NullWritable.class,
            SequenceFile.CompressionType.NONE);
    for (int i = 0; i < splits.length; ++i) {
      w.append(splits[i], NullWritable.get());
    }
  } finally {
    if (null != w) w.close();
  }
  return p;
}
public static void writeCanopyCenters(Configuration conf, ArrayList<StockVector> canopyCenters)
    throws IOException {
  FileSystem fs = FileSystem.get(conf);
  IntWritable IntKey = new IntWritable(1);
  Path canopyFileName = new Path(Nasdaq.CANOPY_SEQ_FILE_PATH);
  System.out.println("before seq file");

  // create file
  @SuppressWarnings("deprecation")
  final SequenceFile.Writer writer =
      SequenceFile.createWriter(fs, conf, canopyFileName, StockVector.class, IntWritable.class);
  System.out.println("after seq file");
  System.out.println("canopies" + canopyCenters.size());

  for (StockVector canopyCenter : canopyCenters) {
    // write canopy to file
    writer.append(canopyCenter, IntKey);
    System.out.println("sum " + canopyCenter.GetSum());
  }
  System.out.println("canopies end" + canopyCenters.size());

  // close writer and file system
  writer.close();
  // fs.close();
}
/**
 * Create a data file in SequenceFile format that gets exported to the db.
 *
 * @param fileNum the number of the file (for multi-file export).
 * @param numRecords how many records to write to the file.
 * @param className the table class name to instantiate and populate for each record.
 */
private void createSequenceFile(int fileNum, int numRecords, String className)
    throws IOException {
  try {
    // Instantiate the value record object via reflection.
    Class cls = Class.forName(className, true, Thread.currentThread().getContextClassLoader());
    SqoopRecord record = (SqoopRecord) ReflectionUtils.newInstance(cls, new Configuration());

    // Create the SequenceFile.
    Configuration conf = new Configuration();
    conf.set("fs.default.name", "file:///");
    FileSystem fs = FileSystem.get(conf);
    Path tablePath = getTablePath();
    Path filePath = new Path(tablePath, "part" + fileNum);
    fs.mkdirs(tablePath);
    SequenceFile.Writer w =
        SequenceFile.createWriter(fs, conf, filePath, LongWritable.class, cls);

    // Now write the data.
    int startId = fileNum * numRecords;
    for (int i = 0; i < numRecords; i++) {
      record.parse(getRecordLine(startId + i));
      w.append(new LongWritable(startId + i), record);
    }

    w.close();
  } catch (ClassNotFoundException cnfe) {
    throw new IOException(cnfe);
  } catch (RecordParser.ParseError pe) {
    throw new IOException(pe);
  }
}
/**
 * Write a partition file for the given job, using the Sampler provided. Queries the sampler for
 * a sample keyset, sorts by the output key comparator, selects the keys for each rank, and
 * writes to the destination returned from {@link TotalOrderPartitioner#getPartitionFile}.
 */
@SuppressWarnings("unchecked") // getInputFormat, getOutputKeyComparator
public static <K, V> void writePartitionFile(Job job, Sampler<K, V> sampler)
    throws IOException, ClassNotFoundException, InterruptedException {
  Configuration conf = job.getConfiguration();
  final InputFormat inf = ReflectionUtils.newInstance(job.getInputFormatClass(), conf);
  int numPartitions = job.getNumReduceTasks();
  K[] samples = sampler.getSample(inf, job);
  RawComparator<K> comparator = (RawComparator<K>) job.getSortComparator();
  Arrays.sort(samples, comparator);
  Path dst = new Path(TotalOrderPartitioner.getPartitionFile(conf));
  FileSystem fs = dst.getFileSystem(conf);
  if (fs.exists(dst)) {
    fs.delete(dst, false);
  }
  SequenceFile.Writer writer =
      SequenceFile.createWriter(fs, conf, dst, job.getMapOutputKeyClass(), NullWritable.class);
  NullWritable nullValue = NullWritable.get();
  float stepSize = samples.length / (float) numPartitions;
  int last = -1;
  for (int i = 1; i < numPartitions; ++i) {
    int k = Math.round(stepSize * i);
    while (last >= k && comparator.compare(samples[last], samples[k]) == 0) {
      ++k;
    }
    writer.append(samples[k], nullValue);
    last = k;
  }
  writer.close();
}
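If this helper mirrors Hadoop's InputSampler.writePartitionFile, a typical call site might look like the sketch below. The job name, partition-file path, sampling parameters, and Text key type are assumptions for illustration, not taken from the snippet:

// Sketch only (assumed setup): sample the input and write the partition file
// consumed by TotalOrderPartitioner for a total-order sort.
Job job = Job.getInstance(conf, "total-order-sort");
job.setPartitionerClass(TotalOrderPartitioner.class);
TotalOrderPartitioner.setPartitionFile(job.getConfiguration(), new Path("/tmp/_partition.lst"));
InputSampler.Sampler<Text, Text> sampler =
    new InputSampler.RandomSampler<Text, Text>(0.1, 10000, 10); // freq, numSamples, maxSplitsSampled
InputSampler.writePartitionFile(job, sampler);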
@Before
public void setUp() throws Exception {
  // create local Pig server
  pigServer = UnitTestUtil.makePigServer();

  // create temp SequenceFile
  File tempFile = File.createTempFile("test", ".txt");
  tempFilename = tempFile.getAbsolutePath();
  Path path = new Path("file:///" + tempFilename);
  Configuration conf = new Configuration();
  FileSystem fs = path.getFileSystem(conf);

  IntWritable key = new IntWritable();
  Text value = new Text();
  SequenceFile.Writer writer = null;
  try {
    writer = SequenceFile.createWriter(fs, conf, path, key.getClass(), value.getClass());
    for (int i = 0; i < DATA.length; ++i) {
      key.set(i);
      value.set(DATA[i]);
      writer.append(key, value);
    }
  } finally {
    IOUtils.closeStream(writer);
  }
}
@Override
public boolean write(String uri, List<InputStream> streams) {
  boolean result = false;
  ByteArrayOutputStream outputStream = new ByteArrayOutputStream();
  try {
    for (InputStream stream : streams) {
      int size = stream.available();
      byte[] bytes = new byte[size];
      int readBytes = stream.read(bytes);
      if (readBytes != size) {
        log.error(
            "Could not read all the bytes from the inputStream. Read "
                + readBytes
                + " instead of "
                + size);
        return false;
      }
      outputStream.write(bytes);
    }
    HDFSByteChunk byteChunk = new HDFSByteChunk(outputStream.toByteArray(), uri);

    SequenceFile.Writer writer = getWriterFor(uri);
    writer.append(new IntWritable(0), byteChunk);
    writer.close();
    result = true; // report success only after the chunk has been appended and the writer closed
  } catch (IOException e) {
    e.printStackTrace();
  }
  return result;
}
/**
 * Write out a SequenceFile that can be read by TotalOrderPartitioner that contains the split
 * points in startKeys.
 *
 * <p>This method was copied from HFileOutputFormat in hbase-0.90.1-cdh3u0. I had to copy it
 * because it's private.
 *
 * @param conf The job configuration.
 * @param partitionsPath output path for SequenceFile.
 * @param startKeys the region start keys to use as the partitions.
 * @throws IOException If there is an error.
 */
private static void writePartitionFile(
    Configuration conf, Path partitionsPath, List<HFileKeyValue> startKeys) throws IOException {
  if (startKeys.isEmpty()) {
    throw new IllegalArgumentException("No regions passed");
  }

  // We're generating a list of split points, and we don't ever
  // have keys < the first region (which has an empty start key)
  // so we need to remove it. Otherwise we would end up with an
  // empty reducer with index 0.
  TreeSet<HFileKeyValue> sorted = new TreeSet<HFileKeyValue>();
  sorted.addAll(startKeys);

  HFileKeyValue first = sorted.first();
  if (0 != first.getRowKey().length) {
    throw new IllegalArgumentException(
        "First region of table should have empty start row key. Instead has: "
            + Bytes.toStringBinary(first.getRowKey()));
  }
  sorted.remove(first);

  // Write the actual file
  final SequenceFile.Writer writer =
      KijiMRPlatformBridge.get()
          .newSeqFileWriter(conf, partitionsPath, HFileKeyValue.class, NullWritable.class);

  try {
    for (HFileKeyValue startKey : sorted) {
      writer.append(startKey, NullWritable.get());
    }
  } finally {
    writer.close();
  }
}
/** Creates the input file (containing the names of the files to be fixed). */
private List<String> createInputFile(
    String jobName, Path inDir, Map<String, Integer> corruptFilePriority, int priority)
    throws IOException {
  Path file = new Path(inDir, jobName + IN_FILE_SUFFIX);
  FileSystem fs = file.getFileSystem(getConf());
  SequenceFile.Writer fileOut =
      SequenceFile.createWriter(fs, getConf(), file, LongWritable.class, Text.class);
  long index = 0L;

  List<String> filesAdded = new ArrayList<String>();
  int count = 0;
  final long max = filesPerTask * BLOCKFIX_TASKS_PER_JOB;
  for (Map.Entry<String, Integer> entry : corruptFilePriority.entrySet()) {
    if (entry.getValue() != priority) {
      continue;
    }
    if (count >= max) {
      break;
    }
    String corruptFileName = entry.getKey();
    fileOut.append(new LongWritable(index++), new Text(corruptFileName));
    filesAdded.add(corruptFileName);
    count++;

    if (index % filesPerTask == 0) {
      fileOut.sync(); // create sync point to make sure we can split here
    }
  }

  fileOut.close();
  return filesAdded;
}
/**
 * @param state The final LanczosState to be serialized
 * @param outputPath The path (relative to the current Configuration's FileSystem) to save the
 *     output to.
 */
public void serializeOutput(LanczosState state, Path outputPath) throws IOException {
  int numEigenVectors = state.getIterationNumber();
  log.info("Persisting {} eigenVectors and eigenValues to: {}", numEigenVectors, outputPath);
  Configuration conf = getConf() != null ? getConf() : new Configuration();
  FileSystem fs = FileSystem.get(outputPath.toUri(), conf);
  SequenceFile.Writer seqWriter =
      new SequenceFile.Writer(fs, conf, outputPath, IntWritable.class, VectorWritable.class);
  try {
    IntWritable iw = new IntWritable();
    for (int i = 0; i < numEigenVectors; i++) {
      // Persist eigenvectors sorted by eigenvalues in descending order
      NamedVector v =
          new NamedVector(
              state.getRightSingularVector(numEigenVectors - 1 - i),
              "eigenVector" + i + ", eigenvalue = "
                  + state.getSingularValue(numEigenVectors - 1 - i));
      Writable vw = new VectorWritable(v);
      iw.set(i);
      seqWriter.append(iw, vw);
    }
  } finally {
    Closeables.close(seqWriter, false);
  }
}
@Override
public boolean writeData(String uri, byte[] data) {
  /*
   * Delete the parent folder if the parent folder exists
   */
  File f = new File(uri);
  if (f.getName().equals(storageConfiguration.getProperty("postfix"))) {
    f = f.getParentFile();
    Path file = new Path(String.valueOf(f));
    try {
      if (fileSystem.exists(file)) {
        fileSystem.delete(file, true);
      }
    } catch (IOException e) {
      e.printStackTrace();
    }
  }

  SequenceFile.Writer writer = getWriterFor(uri);
  HDFSByteChunk byteChunk = new HDFSByteChunk(data, uri);
  try {
    writer.append(new IntWritable(0), byteChunk);
    writer.close();
  } catch (IOException e) {
    e.printStackTrace();
    return false;
  }
  return true;
}
/**
 * Read the document frequency List which is built at the end of the DF Count Job. This will use
 * constant memory and will run at the speed of your disk read
 */
private static Pair<Long[], List<Path>> createDictionaryChunks(
    Path featureCountPath,
    Path dictionaryPathBase,
    Configuration baseConf,
    int chunkSizeInMegabytes)
    throws IOException {
  List<Path> chunkPaths = Lists.newArrayList();
  Configuration conf = new Configuration(baseConf);

  FileSystem fs = FileSystem.get(featureCountPath.toUri(), conf);

  long chunkSizeLimit = chunkSizeInMegabytes * 1024L * 1024L;
  int chunkIndex = 0;
  Path chunkPath = new Path(dictionaryPathBase, FREQUENCY_FILE + chunkIndex);
  chunkPaths.add(chunkPath);
  SequenceFile.Writer freqWriter =
      new SequenceFile.Writer(fs, conf, chunkPath, IntWritable.class, LongWritable.class);

  try {
    long currentChunkSize = 0;
    long featureCount = 0;
    long vectorCount = Long.MAX_VALUE;
    Path filesPattern = new Path(featureCountPath, OUTPUT_FILES_PATTERN);
    for (Pair<IntWritable, LongWritable> record :
        new SequenceFileDirIterable<IntWritable, LongWritable>(
            filesPattern, PathType.GLOB, null, null, true, conf)) {

      if (currentChunkSize > chunkSizeLimit) {
        Closeables.close(freqWriter, false);
        chunkIndex++;

        chunkPath = new Path(dictionaryPathBase, FREQUENCY_FILE + chunkIndex);
        chunkPaths.add(chunkPath);

        freqWriter =
            new SequenceFile.Writer(fs, conf, chunkPath, IntWritable.class, LongWritable.class);
        currentChunkSize = 0;
      }

      int fieldSize = SEQUENCEFILE_BYTE_OVERHEAD + Integer.SIZE / 8 + Long.SIZE / 8;
      currentChunkSize += fieldSize;
      IntWritable key = record.getFirst();
      LongWritable value = record.getSecond();
      if (key.get() >= 0) {
        freqWriter.append(key, value);
      } else if (key.get() == -1) {
        vectorCount = value.get();
      }
      featureCount = Math.max(key.get(), featureCount);
    }
    featureCount++;
    Long[] counts = {featureCount, vectorCount};
    return new Pair<Long[], List<Path>>(counts, chunkPaths);
  } finally {
    Closeables.close(freqWriter, false);
  }
}
public static Job createTimesSquaredJob(
    Configuration initialConf,
    Vector v,
    int outputVectorDim,
    Path matrixInputPath,
    Path outputVectorPathBase,
    Class<? extends TimesSquaredMapper> mapClass,
    Class<? extends VectorSummingReducer> redClass)
    throws IOException {

  FileSystem fs = FileSystem.get(matrixInputPath.toUri(), initialConf);
  matrixInputPath = fs.makeQualified(matrixInputPath);
  outputVectorPathBase = fs.makeQualified(outputVectorPathBase);

  long now = System.nanoTime();
  Path inputVectorPath = new Path(outputVectorPathBase, INPUT_VECTOR + '/' + now);

  SequenceFile.Writer inputVectorPathWriter = null;

  try {
    inputVectorPathWriter =
        new SequenceFile.Writer(
            fs, initialConf, inputVectorPath, NullWritable.class, VectorWritable.class);
    inputVectorPathWriter.append(NullWritable.get(), new VectorWritable(v));
  } finally {
    Closeables.close(inputVectorPathWriter, false);
  }

  URI ivpURI = inputVectorPath.toUri();
  DistributedCache.setCacheFiles(new URI[] {ivpURI}, initialConf);

  Job job =
      HadoopUtil.prepareJob(
          matrixInputPath,
          new Path(outputVectorPathBase, OUTPUT_VECTOR_FILENAME),
          SequenceFileInputFormat.class,
          mapClass,
          NullWritable.class,
          VectorWritable.class,
          redClass,
          NullWritable.class,
          VectorWritable.class,
          SequenceFileOutputFormat.class,
          initialConf);
  job.setCombinerClass(redClass);
  job.setJobName("TimesSquaredJob: " + matrixInputPath);

  Configuration conf = job.getConfiguration();
  conf.set(INPUT_VECTOR, ivpURI.toString());
  conf.setBoolean(IS_SPARSE_OUTPUT, !v.isDense());
  conf.setInt(OUTPUT_VECTOR_DIMENSION, outputVectorDim);

  return job;
}
private static Path saveVector(Configuration conf, Path path, Vector v) throws IOException {
  FileSystem fs = path.getFileSystem(conf);
  SequenceFile.Writer writer =
      new SequenceFile.Writer(fs, conf, path, IntWritable.class, VectorWritable.class);
  try {
    writer.append(new IntWritable(0), new VectorWritable(v));
  } finally {
    writer.close();
  }
  return path;
}
private void write(Object part, int id, Vector vector) throws IOException {
  SequenceFile.Writer writer = writers.get(part);
  if (writer == null) {
    Configuration conf = UDFContext.getUDFContext().getJobConf();
    Path file = PathUtils.enter(getStorePath(), String.valueOf(part), "part-" + Env.getPartID());
    writer = IOUtils.forSequenceWrite(conf, file, IntWritable.class, VectorWritable.class);
    writers.put(part, writer);
  }
  keyWritable.set(id);
  valueWritable.set(vector);
  writer.append(keyWritable, valueWritable);
}
private static void writeClassifier(
    ClusterClassifier classifier, Configuration config, Path path, FileSystem fs)
    throws IOException {
  SequenceFile.Writer writer =
      new SequenceFile.Writer(fs, config, path, Text.class, ClusterClassifier.class);
  Writable key = new Text("test");
  try {
    writer.append(key, classifier);
  } finally {
    Closeables.closeQuietly(writer);
  }
}
/** Reduce task done, write output to a file. */
@Override
public void close() throws IOException {
  // write output to a file
  Path outDir = new Path(TMP_DIR, "out");
  Path outFile = new Path(outDir, "reduce-out");
  FileSystem fileSys = FileSystem.get(conf);
  SequenceFile.Writer writer =
      SequenceFile.createWriter(
          fileSys, conf, outFile, LongWritable.class, LongWritable.class, CompressionType.NONE);
  writer.append(new LongWritable(numInside), new LongWritable(numOutside));
  writer.close();
}
public static void toSequenceFile(String fileName, Collection<String> pdbIds, boolean verbose)
    throws IOException {

  int failure = 0;
  int success = 0;
  int chains = 0;

  try (SequenceFile.Writer writer =
      SequenceFile.createWriter(
          new Configuration(),
          SequenceFile.Writer.file(new Path(fileName)),
          SequenceFile.Writer.keyClass(Text.class),
          SequenceFile.Writer.valueClass(IntArrayWritable.class),
          SequenceFile.Writer.compression(
              SequenceFile.CompressionType.BLOCK, new BZip2Codec()))) {

    for (String pdbId : pdbIds) {
      if (verbose) {
        System.out.println(pdbId);
      }

      Structure s = null;
      try {
        s = StructureIO.getStructure(pdbId);
        success++;
      } catch (Exception e) {
        // some files can't be read. Let's just skip those!
        e.printStackTrace();
        failure++;
        continue;
      }

      if (s == null) {
        System.err.println("structure null: " + pdbId);
        continue;
      }

      if (s.getChains().size() == 0) {
        continue;
      }

      chains += append(writer, pdbId, s);
    }
    IOUtils.closeStream(writer);
  }

  if (verbose) {
    System.out.println("Total structures: " + pdbIds.size());
    System.out.println("Success: " + success);
    System.out.println("Failure: " + failure);
    System.out.println("Chains: " + chains);
  }
}