/**
 * Finalizes the date partitions of a topic that fall on or before the given date.
 *
 * <p>Partitions are visited from newest to oldest. For each one the partition is
 * registered through {@code mQuboleClient.addPartition} and, if that call succeeds,
 * a {@code _SUCCESS} marker file is touched in the partition's log directory. The
 * method returns as soon as it meets a partition whose {@code _SUCCESS} marker
 * already exists; older partitions are not examined.
 *
 * @param topic    topic whose partitions are finalized
 * @param calendar inclusive upper bound on the partition date
 * @throws IOException          declared for the file system calls made here
 * @throws ParseException       declared for the partition listing
 * @throws InterruptedException declared by the original signature
 */
private void finalizePartitionsUpTo(String topic, Calendar calendar) throws IOException,
        ParseException, InterruptedException {
    NavigableSet<Calendar> newestFirst =
        getPartitions(topic).headSet(calendar, true).descendingSet();
    final String s3Prefix = "s3n://" + mConfig.getS3Bucket() + "/" + mConfig.getS3Path();

    SimpleDateFormat dayFormat = new SimpleDateFormat("yyyy-MM-dd");
    dayFormat.setTimeZone(TimeZone.getTimeZone("UTC"));

    for (Calendar day : newestFirst) {
        String dayStr = dayFormat.format(day.getTime());
        String[] pathPartitions = {"dt=" + dayStr};
        LogFilePath dayPath = new LogFilePath(s3Prefix, topic, pathPartitions,
            mConfig.getGeneration(), 0, 0, mFileExtension);
        String dayDir = dayPath.getLogFileDir();
        assert FileUtil.exists(dayDir) : "FileUtil.exists(" + dayDir + ")";

        String markerPath = dayDir + "/_SUCCESS";
        if (FileUtil.exists(markerPath)) {
            // An existing marker ends the whole walk, not just this iteration.
            return;
        }

        boolean registered;
        try {
            mQuboleClient.addPartition(mConfig.getHivePrefix() + topic, "dt='" + dayStr + "'");
            registered = true;
        } catch (Exception e) {
            // NOTE(review): this broad catch would also absorb an InterruptedException
            // if addPartition can throw one - confirm that is intended.
            LOG.error("failed to finalize topic " + topic + " partition dt=" + dayStr, e);
            registered = false;
        }

        // The marker is only written for partitions that were registered successfully;
        // a failed partition is skipped and the walk moves on to the next older one.
        if (registered) {
            LOG.info("touching file " + markerPath);
            FileUtil.touch(markerPath);
        }
    }
}
/**
 * Fills {@code mTopicPartitionToOffsetToFiles} from a recursive listing of the
 * topic prefix.
 *
 * <p>Every listed path that does not end in {@code /_SUCCESS} is parsed into a
 * {@link LogFilePath} and filed under its topic/Kafka-partition pair and then
 * under its offset. Missing intermediate maps and sets are created on demand.
 *
 * @throws IOException declared for the recursive listing
 */
private void populateTopicPartitionToOffsetToFiles() throws IOException {
    String prefix = getPrefix();
    String topicPrefix = getTopicPrefix();

    for (String listedPath : FileUtil.listRecursively(topicPrefix)) {
        if (listedPath.endsWith("/_SUCCESS")) {
            // Marker files are skipped; they are not parsed as log file paths.
            continue;
        }

        LogFilePath parsedPath = new LogFilePath(prefix, listedPath);
        TopicPartition owner =
            new TopicPartition(parsedPath.getTopic(), parsedPath.getKafkaPartition());

        SortedMap<Long, HashSet<LogFilePath>> filesByOffset =
            mTopicPartitionToOffsetToFiles.get(owner);
        if (filesByOffset == null) {
            filesByOffset = new TreeMap<Long, HashSet<LogFilePath>>();
            mTopicPartitionToOffsetToFiles.put(owner, filesByOffset);
        }

        long offset = parsedPath.getOffset();
        HashSet<LogFilePath> filesAtOffset = filesByOffset.get(offset);
        if (filesAtOffset == null) {
            filesAtOffset = new HashSet<LogFilePath>();
            filesByOffset.put(offset, filesAtOffset);
        }
        filesAtOffset.add(parsedPath);
    }
}
/**
 * Counts the key/value records in the sequence file at the given log file path.
 *
 * @param logFilePath path object whose {@code getLogFilePath()} names the file to read
 * @return number of records for which {@code reader.next} returned {@code true}
 * @throws Exception if the file cannot be opened or read, or if the key/value
 *         holder objects cannot be instantiated
 */
private int getMessageCount(LogFilePath logFilePath) throws Exception {
    String path = logFilePath.getLogFilePath();
    Path fsPath = new Path(path);
    FileSystem fileSystem = FileUtil.getFileSystem(path);
    SequenceFile.Reader reader = new SequenceFile.Reader(fileSystem, fsPath, new Configuration());
    // Everything after a successful open runs under try/finally so the reader is
    // released even when instantiation or a read fails part-way through the file.
    try {
        LongWritable key = (LongWritable) reader.getKeyClass().newInstance();
        BytesWritable value = (BytesWritable) reader.getValueClass().newInstance();
        int result = 0;
        while (reader.next(key, value)) {
            result++;
        }
        return result;
    } finally {
        reader.close();
    }
}
/**
 * Reads every record key from the sequence file at the given log file path and
 * adds it to {@code offsets}.
 *
 * @param logFilePath path object whose {@code getLogFilePath()} names the file to read
 * @param offsets     set that receives each key value; it is modified in place
 * @throws RuntimeException if a key value is already present in {@code offsets}
 * @throws Exception        if the file cannot be opened or read, or if the key/value
 *                          holder objects cannot be instantiated
 */
private void getOffsets(LogFilePath logFilePath, Set<Long> offsets) throws Exception {
    String path = logFilePath.getLogFilePath();
    Path fsPath = new Path(path);
    FileSystem fileSystem = FileUtil.getFileSystem(path);
    SequenceFile.Reader reader = new SequenceFile.Reader(fileSystem, fsPath, new Configuration());
    // try/finally guarantees the reader is closed on the duplicate-key failure
    // path as well as on read errors, not only after a clean scan.
    try {
        LongWritable key = (LongWritable) reader.getKeyClass().newInstance();
        BytesWritable value = (BytesWritable) reader.getValueClass().newInstance();
        while (reader.next(key, value)) {
            if (!offsets.add(key.get())) {
                throw new RuntimeException(
                    "duplicate key " + key.get() + " found in file "
                        + logFilePath.getLogFilePath());
            }
        }
    } finally {
        reader.close();
    }
}
/**
 * Collects the dates of the {@code dt=yyyy-MM-dd} directories found for a topic.
 *
 * <p>A placeholder {@link LogFilePath} is built only to obtain the directory from
 * {@code getLogFileParentDir()}; that directory is listed and every entry whose
 * path ends in {@code /dt=NNNN-NN-NN} is parsed as a UTC date. Entries that do
 * not match the pattern are ignored.
 *
 * @param topic topic whose partition directories are listed
 * @return the parsed dates as UTC calendars, in ascending order
 * @throws IOException    declared for the directory listing
 * @throws ParseException if a matched date string cannot be parsed
 */
private NavigableSet<Calendar> getPartitions(String topic) throws IOException, ParseException {
    final String s3Prefix = "s3n://" + mConfig.getS3Bucket() + "/" + mConfig.getS3Path();
    String[] partitions = {"dt="};
    LogFilePath logFilePath = new LogFilePath(s3Prefix, topic, partitions,
        mConfig.getGeneration(), 0, 0, mFileExtension);
    String parentDir = logFilePath.getLogFileParentDir();
    String[] partitionDirs = FileUtil.list(parentDir);
    Pattern pattern = Pattern.compile(".*/dt=(\\d\\d\\d\\d-\\d\\d-\\d\\d)$");

    // The formatter is loop-invariant, so it is built once instead of per directory.
    // NOTE(review): SimpleDateFormat is lenient by default, so a digit-valid but
    // out-of-range value (e.g. month 13) rolls over instead of raising ParseException.
    SimpleDateFormat format = new SimpleDateFormat("yyyy-MM-dd");
    format.setTimeZone(TimeZone.getTimeZone("UTC"));

    TreeSet<Calendar> result = new TreeSet<Calendar>();
    for (String partitionDir : partitionDirs) {
        Matcher matcher = pattern.matcher(partitionDir);
        if (matcher.find()) {
            String date = matcher.group(1);
            // Each result gets its own Calendar (and TimeZone) instance.
            Calendar calendar = Calendar.getInstance(TimeZone.getTimeZone("UTC"));
            calendar.setTime(format.parse(date));
            result.add(calendar);
        }
    }
    return result;
}