/**
 * Reads keys from a SequenceFile, up to a maximum number.
 *
 * <p>A new key object is allocated for every stored record so that the entries of the returned
 * list do not all alias the object handed to the reader.
 *
 * @param path path to the SequenceFile
 * @param fs file system used to open {@code path}; its configuration is passed to the reader
 * @param max maximum number of keys to read; the limit is checked after a key has been stored,
 *     so a non-empty file always yields at least one key
 * @return the keys that were read, in file order
 * @throws RuntimeException if the file cannot be read or a key/value object cannot be
 *     instantiated; the underlying exception is attached as the cause
 */
@SuppressWarnings("unchecked")
public static <K extends Writable> List<K> readKeys(Path path, FileSystem fs, int max) {
  List<K> list = new ArrayList<K>();
  try {
    SequenceFile.Reader reader = new SequenceFile.Reader(fs, path, fs.getConf());
    try {
      int k = 0;
      K key = (K) reader.getKeyClass().newInstance();
      Writable value = (Writable) reader.getValueClass().newInstance();
      while (reader.next(key, value)) {
        k++;
        list.add(key);
        if (k >= max) {
          break;
        }
        // Fresh key for the next record; the stored one must not be overwritten.
        key = (K) reader.getKeyClass().newInstance();
      }
    } finally {
      // Release the underlying stream even when reading or instantiation fails.
      reader.close();
    }
  } catch (Exception e) {
    throw new RuntimeException("Error reading SequenceFile " + path, e);
  }
  return list;
}
/** * Reads key-value pairs from a SequenceFile, up to a maximum number. * * @param path path to file * @param max maximum of key-value pairs to read * @return list of key-value pairs */ @SuppressWarnings("unchecked") public static <K extends Writable, V extends Writable> List<PairOfWritables<K, V>> readFile( Path path, FileSystem fs, int max) throws IOException { List<PairOfWritables<K, V>> list = new ArrayList<PairOfWritables<K, V>>(); try { int k = 0; SequenceFile.Reader reader = new SequenceFile.Reader(fs, path, fs.getConf()); K key; V value; key = (K) reader.getKeyClass().newInstance(); value = (V) reader.getValueClass().newInstance(); while (reader.next(key, value)) { k++; list.add(new PairOfWritables<K, V>(key, value)); if (k >= max) { break; } // Create new objects, because the key, value gets reused key = (K) reader.getKeyClass().newInstance(); value = (V) reader.getValueClass().newInstance(); } reader.close(); } catch (IllegalAccessException e) { throw new RuntimeException("Error reading SequenceFile: " + e); } catch (InstantiationException e) { throw new RuntimeException("Error reading SequenceFile: " + e); } return list; }
private static void loadPairs( HMapIV<ArrayListOfIntsWritable> pwsimMapping, int langID, JobConf job, Reporter reporter) { try { Path[] localFiles = DistributedCache.getLocalCacheFiles(job); String pwsimFile = job.get("PwsimPairs"); for (Path localFile : localFiles) { if (localFile.toString().contains(getFilename(pwsimFile))) { SequenceFile.Reader reader = new SequenceFile.Reader(FileSystem.getLocal(job), localFile, job); PairOfInts key = (PairOfInts) reader.getKeyClass().newInstance(); IntWritable value = (IntWritable) reader.getValueClass().newInstance(); int cnt = 0; while (reader.next(key, value)) { int fDocno = key.getRightElement(); int eDocno = key.getLeftElement(); if ((eDocno == 6127 && fDocno == 1000000074) || (eDocno == 6127 && fDocno == 1000000071)) { sLogger.info(key); } if (langID == CLIRUtils.E) { if (!pwsimMapping.containsKey(eDocno)) { pwsimMapping.put(eDocno, new ArrayListOfIntsWritable()); } pwsimMapping .get(eDocno) .add( fDocno); // we add 1000000000 to foreign docnos to distinguish them during // pwsim algo } else { if (!pwsimMapping.containsKey(fDocno)) { pwsimMapping.put(fDocno, new ArrayListOfIntsWritable()); } pwsimMapping .get(fDocno) .add( eDocno); // we add 1000000000 to foreign docnos to distinguish them during // pwsim algo } cnt++; key = (PairOfInts) reader.getKeyClass().newInstance(); value = (IntWritable) reader.getValueClass().newInstance(); } reader.close(); sLogger.info(pwsimMapping.size() + "," + cnt + " pairs loaded from " + localFile); } } } catch (Exception e) { throw new RuntimeException(e); } }
/**
 * Dumps the records of every data file under {@code input} as text lines.
 *
 * <p>Each record is written as the value's doc id, a tab, the key and a newline. Output goes to
 * {@code outputFile} encoded as UTF-8, or to standard output when {@code outputFile} is
 * {@code null}. The writer is closed before returning, including when it wraps standard output.
 *
 * @param conf Hadoop configuration used to resolve the file system and open the files
 * @param input directory (or path) whose entries accepted by {@code DataPathFilter} are read
 * @param outputFile target file name, or {@code null} for standard output
 * @throws IOException if listing, reading or writing fails
 * @throws InstantiationException if the key class cannot be instantiated
 * @throws IllegalAccessException if the key class constructor is not accessible
 */
public static void run(Configuration conf, Path input, String outputFile)
    throws IOException, InstantiationException, IllegalAccessException {
  Writer out =
      outputFile == null
          ? new OutputStreamWriter(System.out)
          : new OutputStreamWriter(
              new FileOutputStream(new File(outputFile)), Charset.forName("UTF-8"));
  try {
    FileSystem fileSystem = input.getFileSystem(conf);
    FileStatus[] dataFiles = fileSystem.listStatus(input, new DataPathFilter());
    for (FileStatus dataFile : dataFiles) {
      SequenceFile.Reader seqReader =
          new SequenceFile.Reader(fileSystem, dataFile.getPath(), conf);
      try {
        Text recordKey = seqReader.getKeyClass().asSubclass(Text.class).newInstance();
        DocumentMapping mapping = new DocumentMapping();
        while (seqReader.next(recordKey, mapping)) {
          out.write(mapping.getDocId() + "\t" + recordKey + "\n");
        }
      } finally {
        seqReader.close();
      }
    }
  } finally {
    out.close();
  }
}
/** * return the x*y * * @param url * @return */ public Double[] getR(String url) { List<Double> list = new ArrayList<Double>(); Path path = new Path(url); Configuration conf = HUtils.getConf(); SequenceFile.Reader reader = null; try { reader = new SequenceFile.Reader( conf, Reader.file(path), Reader.bufferSize(4096), Reader.start(0)); DoubleArrStrWritable dkey = (DoubleArrStrWritable) ReflectionUtils.newInstance(reader.getKeyClass(), conf); DoublePairWritable dvalue = (DoublePairWritable) ReflectionUtils.newInstance(reader.getValueClass(), conf); while (reader.next(dkey, dvalue)) { // 循环读取文件 // list.add(dvalue.getSum()*dvalue.getDistance()); } } catch (Exception e) { e.printStackTrace(); } finally { IOUtils.closeStream(reader); } Double[] dList = new Double[list.size()]; dList = list.toArray(dList); Arrays.sort(dList); return dList; }
public static XYSeries getXY(String url) { XYSeries xyseries = new XYSeries(""); Path path = new Path(url); Configuration conf = HUtils.getConf(); SequenceFile.Reader reader = null; try { reader = new SequenceFile.Reader( conf, Reader.file(path), Reader.bufferSize(4096), Reader.start(0)); DoubleArrStrWritable dkey = (DoubleArrStrWritable) ReflectionUtils.newInstance(reader.getKeyClass(), conf); DoublePairWritable dvalue = (DoublePairWritable) ReflectionUtils.newInstance(reader.getValueClass(), conf); while (reader.next(dkey, dvalue)) { // 循环读取文件 xyseries.add(dvalue.getFirst(), dvalue.getSecond()); } } catch (Exception e) { e.printStackTrace(); } finally { IOUtils.closeStream(reader); } return xyseries; }
public SeqFileInputStream(FileSystem fs, FileStatus f) throws IOException { r = new SequenceFile.Reader(fs, f.getPath(), getConf()); key = ReflectionUtils.newInstance( r.getKeyClass().asSubclass(WritableComparable.class), getConf()); val = ReflectionUtils.newInstance(r.getValueClass().asSubclass(Writable.class), getConf()); inbuf = new DataInputBuffer(); outbuf = new DataOutputBuffer(); }
/**
 * Prints up to {@code max} records of a SequenceFile to standard output.
 *
 * <p>Keys or values whose class is assignable to {@code Tuple} are created through
 * {@code TUPLE_FACTORY}; all others through their no-argument constructor. Failures while
 * reading records are printed to standard error and the number of records printed so far is
 * returned. The reader is closed on every path.
 *
 * @param path file to read
 * @param fs file system used to open {@code path}; its configuration is passed to the reader
 * @param max maximum number of records to print; the limit is checked after a record has been
 *     printed, so a non-empty file always yields at least one record
 * @return number of records printed
 * @throws IOException if the file cannot be opened
 * @throws RuntimeException if the key or value class cannot be loaded
 */
private static int readSequenceFile(Path path, FileSystem fs, int max) throws IOException {
  SequenceFile.Reader reader = new SequenceFile.Reader(fs, path, fs.getConf());
  System.out.println("Reading " + path + "...\n");
  try {
    System.out.println("Key type: " + reader.getKeyClass().toString());
    System.out.println("Value type: " + reader.getValueClass().toString() + "\n");
  } catch (Exception e) {
    try {
      reader.close();
    } catch (IOException ignored) {
      // The class-loading failure reported below is the more useful error.
    }
    throw new RuntimeException("Error: loading key/value class", e);
  }

  int n = 0;
  try {
    try {
      Writable key;
      Writable value;
      if (Tuple.class.isAssignableFrom(reader.getKeyClass())) {
        key = TUPLE_FACTORY.newTuple();
      } else {
        key = (Writable) reader.getKeyClass().newInstance();
      }
      if (Tuple.class.isAssignableFrom(reader.getValueClass())) {
        value = TUPLE_FACTORY.newTuple();
      } else {
        value = (Writable) reader.getValueClass().newInstance();
      }
      while (reader.next(key, value)) {
        System.out.println("Record " + n);
        System.out.println("Key: " + key + "\nValue: " + value);
        System.out.println("----------------------------------------");
        n++;
        if (n >= max) {
          break;
        }
      }
    } finally {
      // Close even when a record fails to deserialize; previously the reader leaked here.
      reader.close();
    }
    System.out.println(n + " records read.\n");
  } catch (Exception e) {
    // Best effort: report the problem and return what was read so far.
    e.printStackTrace();
  }
  return n;
}
private static List<IDistanceDensityMul> getIDistanceDensityMulList(String url) throws FileNotFoundException, IOException { Configuration conf = HUtils.getConf(); SequenceFile.Reader reader = null; // 多个文件整合,需排序 List<IDistanceDensityMul> allList = new ArrayList<IDistanceDensityMul>(); // 单个文件 List<IDistanceDensityMul> fileList = new ArrayList<IDistanceDensityMul>(); FileStatus[] fss = HUtils.getHDFSPath(url, "true") .getFileSystem(conf) .listStatus(HUtils.getHDFSPath(url, "true")); for (FileStatus f : fss) { if (!f.toString().contains("part")) { continue; // 排除其他文件 } try { reader = new SequenceFile.Reader( conf, Reader.file(f.getPath()), Reader.bufferSize(4096), Reader.start(0)); // <density_i*min_distancd_j> <first:density_i,second:min_distance_j,third:i> // DoubleWritable, IntDoublePairWritable CustomDoubleWritable dkey = (CustomDoubleWritable) ReflectionUtils.newInstance(reader.getKeyClass(), conf); IntDoublePairWritable dvalue = (IntDoublePairWritable) ReflectionUtils.newInstance(reader.getValueClass(), conf); int i = Utils.GETDRAWPICRECORDS_EVERYFILE; while (reader.next(dkey, dvalue) && i > 0) { // 循环读取文件 i--; fileList.add( new IDistanceDensityMul( dvalue.getSecond(), dvalue.getFirst(), dvalue.getThird(), dkey.get())); // 每个文件都是从小到大排序的 } } catch (Exception e) { e.printStackTrace(); } finally { IOUtils.closeStream(reader); } // 整合当前文件的前面若干条记录(Utils.GETDRAWPICRECORDS_EVERYFILE ) if (allList.size() <= 0) { // 第一次可以全部添加 allList.addAll(fileList); } else { combineLists(allList, fileList); } } // for // 第一个点太大了,选择去掉 return allList.subList(1, allList.size()); }
@Override protected boolean doProcess(Record inputRecord, final InputStream in) throws IOException { SequenceFile.Metadata sequenceFileMetaData = null; SequenceFile.Reader reader = null; try { reader = new SequenceFile.Reader( conf, SequenceFile.Reader.stream(new FSDataInputStream(new ForwardOnlySeekable(in)))); if (includeMetaData) { sequenceFileMetaData = reader.getMetadata(); } Class keyClass = reader.getKeyClass(); Class valueClass = reader.getValueClass(); Record template = inputRecord.copy(); removeAttachments(template); while (true) { Writable key = (Writable) ReflectionUtils.newInstance(keyClass, conf); Writable val = (Writable) ReflectionUtils.newInstance(valueClass, conf); try { if (!reader.next(key, val)) { break; } } catch (EOFException ex) { // SequenceFile.Reader will throw an EOFException after reading // all the data, if it doesn't know the length. Since we are // passing in an InputStream, we hit this case; LOG.trace("Received expected EOFException", ex); break; } incrementNumRecords(); Record outputRecord = template.copy(); outputRecord.put(keyField, key); outputRecord.put(valueField, val); outputRecord.put(Fields.ATTACHMENT_MIME_TYPE, OUTPUT_MEDIA_TYPE); if (includeMetaData && sequenceFileMetaData != null) { outputRecord.put(SEQUENCE_FILE_META_DATA, sequenceFileMetaData); } // pass record to next command in chain: if (!getChild().process(outputRecord)) { return false; } } } finally { Closeables.closeQuietly(reader); } return true; }
/**
 * Counts the records stored in the SequenceFile behind {@code logFilePath}.
 *
 * @param logFilePath descriptor whose {@code getLogFilePath()} names the file to read
 * @return number of records in the file
 * @throws Exception if the file cannot be opened or read, or the key/value objects cannot be
 *     instantiated
 */
private int getMessageCount(LogFilePath logFilePath) throws Exception {
  String path = logFilePath.getLogFilePath();
  Path fsPath = new Path(path);
  FileSystem fileSystem = FileUtil.getFileSystem(path);
  SequenceFile.Reader reader = new SequenceFile.Reader(fileSystem, fsPath, new Configuration());
  try {
    LongWritable key = (LongWritable) reader.getKeyClass().newInstance();
    BytesWritable value = (BytesWritable) reader.getValueClass().newInstance();
    int result = 0;
    while (reader.next(key, value)) {
      result++;
    }
    return result;
  } finally {
    // Close on every path; a read failure previously left the file open.
    reader.close();
  }
}
public static void main(String[] args) throws Exception { // TODO Auto-generated method stub String mapUri = args[0]; Configuration conf = new Configuration(); FileSystem fs = FileSystem.get(URI.create(mapUri), conf); Path map = new Path(mapUri); Path mapData = new Path(mapUri, MapFile.DATA_FILE_NAME); SequenceFile.Reader reader = new SequenceFile.Reader(conf, SequenceFile.Reader.file(mapData)); Class keyClass = reader.getKeyClass(); Class valueClass = reader.getValueClass(); reader.close(); long entries = MapFile.fix(fs, map, keyClass, valueClass, false, conf); System.out.printf("Created MapFile %s with %d entries\n", map, entries); }
/**
 * Adds the key of every record in the SequenceFile behind {@code logFilePath} to
 * {@code offsets}, failing on the first key that is already present.
 *
 * @param logFilePath descriptor whose {@code getLogFilePath()} names the file to read
 * @param offsets set that receives the keys; modified in place
 * @throws RuntimeException if a key is already contained in {@code offsets}
 * @throws Exception if the file cannot be opened or read, or the key/value objects cannot be
 *     instantiated
 */
private void getOffsets(LogFilePath logFilePath, Set<Long> offsets) throws Exception {
  String path = logFilePath.getLogFilePath();
  Path fsPath = new Path(path);
  FileSystem fileSystem = FileUtil.getFileSystem(path);
  SequenceFile.Reader reader = new SequenceFile.Reader(fileSystem, fsPath, new Configuration());
  try {
    LongWritable key = (LongWritable) reader.getKeyClass().newInstance();
    BytesWritable value = (BytesWritable) reader.getValueClass().newInstance();
    while (reader.next(key, value)) {
      if (!offsets.add(key.get())) {
        throw new RuntimeException(
            "duplicate key " + key.get() + " found in file " + logFilePath.getLogFilePath());
      }
    }
  } finally {
    // Close on every path, including the duplicate-key failure above.
    reader.close();
  }
}
/**
 * Reads the first record of the SequenceFile at {@code input} into a new {@code MedoidSet}.
 *
 * <p>The result of the read call is not checked, so the returned set is simply left as
 * constructed when the file holds no record.
 *
 * @param input SequenceFile to read
 * @param config configuration used to obtain the file system and open the file
 * @return the medoid set populated from the first record
 * @throws IOException if the file cannot be opened or read
 * @throws IllegalAccessException if the key class constructor is not accessible
 * @throws InstantiationException if the key class cannot be instantiated
 */
private MedoidSet readMedoidsSet(Path input, Configuration config)
    throws IOException, IllegalAccessException, InstantiationException {
  SequenceFile.Reader seqReader =
      new SequenceFile.Reader(FileSystem.get(config), input, config);
  MedoidSet medoids = new MedoidSet();
  try {
    // The key is only needed as a holder; just the value is of interest.
    Writable ignoredKey = (Writable) seqReader.getKeyClass().newInstance();
    seqReader.next(ignoredKey, medoids);
  } finally {
    IOUtils.quietClose(seqReader);
  }
  log.debug("Read initial medoid set:" + medoids);
  return medoids;
}
private static Map<Integer, List<WeightedVectorWritable>> readPoints( Path pointsPathDir, Configuration conf) throws IOException { Map<Integer, List<WeightedVectorWritable>> result = new TreeMap<Integer, List<WeightedVectorWritable>>(); FileSystem fs = pointsPathDir.getFileSystem(conf); FileStatus[] children = fs.listStatus( pointsPathDir, new PathFilter() { @Override public boolean accept(Path path) { return !(path.getName().endsWith(".crc") || "_logs".equals(path.getName())); } }); for (FileStatus file : children) { Path path = file.getPath(); SequenceFile.Reader reader = new SequenceFile.Reader(fs, path, conf); try { IntWritable key = reader.getKeyClass().asSubclass(IntWritable.class).newInstance(); WeightedVectorWritable value = reader.getValueClass().asSubclass(WeightedVectorWritable.class).newInstance(); while (reader.next(key, value)) { // value is the cluster id as an int, key is the name/id of the // vector, but that doesn't matter because we only care about printing // it // String clusterId = value.toString(); List<WeightedVectorWritable> pointList = result.get(key.get()); if (pointList == null) { pointList = new ArrayList<WeightedVectorWritable>(); result.put(key.get(), pointList); } pointList.add(value); value = reader.getValueClass().asSubclass(WeightedVectorWritable.class).newInstance(); } } catch (InstantiationException e) { log.error("Exception", e); } catch (IllegalAccessException e) { log.error("Exception", e); } } return result; }
/**
 * Reads the next key/value pair from the SequenceFile reader held by {@code hdfsistr}.
 *
 * @param hdfsistr stream wrapper whose {@code getIn()} is the SequenceFile reader
 * @param key receives the object produced from the key by {@code getObject}
 * @param value receives the object produced from the value by {@code getObject}
 * @return sum of the key and value sizes reported by {@code getObject}, or 0 when the reader
 *     has no further record
 * @throws RuntimeCamelException wrapping any failure
 */
@Override
public long next(HdfsInputStream hdfsistr, Holder<Object> key, Holder<Object> value) {
  try {
    SequenceFile.Reader seqReader = (SequenceFile.Reader) hdfsistr.getIn();

    Writable rawKey =
        (Writable) ReflectionUtils.newInstance(seqReader.getKeyClass(), new Configuration());
    Writable rawValue =
        (Writable) ReflectionUtils.newInstance(seqReader.getValueClass(), new Configuration());

    if (!seqReader.next(rawKey, rawValue)) {
      return 0;
    }

    Holder<Integer> keyBytes = new Holder<Integer>();
    Holder<Integer> valueBytes = new Holder<Integer>();
    key.value = getObject(rawKey, keyBytes);
    value.value = getObject(rawValue, valueBytes);
    return keyBytes.value + valueBytes.value;
  } catch (Exception ex) {
    throw new RuntimeCamelException(ex);
  }
}
public void printClusters(String[] dictionary) throws IOException, InstantiationException, IllegalAccessException { Configuration conf = new Configuration(); if (this.termDictionary != null) { if ("text".equals(dictionaryFormat)) { dictionary = VectorHelper.loadTermDictionary(new File(this.termDictionary)); } else if ("sequencefile".equals(dictionaryFormat)) { FileSystem fs = FileSystem.get(new Path(this.termDictionary).toUri(), conf); dictionary = VectorHelper.loadTermDictionary(conf, fs, this.termDictionary); } else { throw new IllegalArgumentException("Invalid dictionary format"); } } Writer writer = this.outputFile == null ? new OutputStreamWriter(System.out) : new FileWriter(this.outputFile); try { FileSystem fs = seqFileDir.getFileSystem(conf); for (FileStatus seqFile : fs.globStatus(new Path(seqFileDir, "part-*"))) { Path path = seqFile.getPath(); // System.out.println("Input Path: " + path); doesn't this interfere with output? SequenceFile.Reader reader = new SequenceFile.Reader(fs, path, conf); try { Writable key = reader.getKeyClass().asSubclass(Writable.class).newInstance(); Writable value = reader.getValueClass().asSubclass(Writable.class).newInstance(); while (reader.next(key, value)) { Cluster cluster = (Cluster) value; String fmtStr = useJSON ? 
cluster.asJsonString() : cluster.asFormatString(dictionary); if (subString > 0 && fmtStr.length() > subString) { writer.write(':'); writer.write(fmtStr, 0, Math.min(subString, fmtStr.length())); } else { writer.write(fmtStr); } writer.write('\n'); if (dictionary != null) { String topTerms = getTopFeatures(cluster.getCenter(), dictionary, numTopFeatures); writer.write("\tTop Terms: "); writer.write(topTerms); writer.write('\n'); } List<WeightedVectorWritable> points = clusterIdToPoints.get(cluster.getId()); if (points != null) { writer.write("\tWeight: Point:\n\t"); for (Iterator<WeightedVectorWritable> iterator = points.iterator(); iterator.hasNext(); ) { WeightedVectorWritable point = iterator.next(); writer.write(String.valueOf(point.getWeight())); writer.write(": "); writer.write(AbstractCluster.formatVector(point.getVector(), dictionary)); if (iterator.hasNext()) { writer.write("\n\t"); } } writer.write('\n'); } } } finally { reader.close(); } } } finally { writer.close(); } }
protected void sequenceCrush(FileSystem fs, FileStatus[] status) throws IOException, CrushException { l4j.info("Sequence file crushing activated"); Class keyClass = null; Class valueClass = null; SequenceFile.Writer writer = null; for (FileStatus stat : status) { if (reporter != null) { reporter.setStatus("Crushing on " + stat.getPath()); l4j.info("Current file " + stat.getPath()); l4j.info("length " + stat.getLen()); reporter.incrCounter(CrushMapper.CrushCounters.FILES_CRUSHED, 1); } Path p1 = stat.getPath(); SequenceFile.Reader read = new SequenceFile.Reader(fs, p1, jobConf); if (keyClass == null) { keyClass = read.getKeyClass(); valueClass = read.getValueClass(); writer = SequenceFile.createWriter( fs, jobConf, outPath, keyClass, valueClass, this.compressionType, this.codec); } else { if (!(keyClass.equals(read.getKeyClass()) && valueClass.equals(read.getValueClass()))) { read.close(); writer.close(); throw new CrushException( "File " + stat.getPath() + " keyClass " + read.getKeyClass() + " valueClass " + read.getValueClassName() + " does not match" + " other files in folder"); } } Writable k = (Writable) ReflectionUtils.newInstance(keyClass, jobConf); Writable v = (Writable) ReflectionUtils.newInstance(valueClass, jobConf); int rowCount = 0; while (read.next(k, v)) { writer.append(k, v); rowCount++; if (rowCount % 100000 == 0) { if (reporter != null) { reporter.setStatus(stat + " at row " + rowCount); l4j.debug(stat + " at row " + rowCount); } } } read.close(); if (reporter != null) { reporter.incrCounter(CrushMapper.CrushCounters.ROWS_WRITTEN, rowCount); } } // end for writer.close(); l4j.info("crushed file written to " + outPath); }