/**
 * Adds a new HdfsPartition to the internal partition list, populating it with file format
 * information and file locations. If the partition directory does not exist, no partition is
 * added. For unchanged files (indicated by an unchanged mtime), reuses the FileDescriptor from
 * oldFileDescMap. Otherwise, creates a new FileDescriptor for each modified or new file and
 * adds it to newFileDescs. Returns the new partition, or null if none was added.
 *
 * @throws InvalidStorageDescriptorException if the supplied storage descriptor contains
 *           metadata that Impala can't understand.
 */
private HdfsPartition addPartition(StorageDescriptor storageDescriptor,
    org.apache.hadoop.hive.metastore.api.Partition msPartition,
    List<LiteralExpr> partitionKeyExprs, Map<String, FileDescriptor> oldFileDescMap,
    List<FileDescriptor> newFileDescs)
    throws IOException, InvalidStorageDescriptorException {
  HdfsStorageDescriptor fileFormatDescriptor =
      HdfsStorageDescriptor.fromStorageDescriptor(this.name_, storageDescriptor);
  Path partDirPath = new Path(storageDescriptor.getLocation());
  List<FileDescriptor> fileDescriptors = Lists.newArrayList();
  if (DFS.exists(partDirPath)) {
    // DistributedFileSystem does not have an API that takes in a timestamp and returns
    // a list of files that have been added/changed since. Therefore, we call
    // DFS.listStatus() to list all the files.
    for (FileStatus fileStatus: DFS.listStatus(partDirPath)) {
      String fileName = fileStatus.getPath().getName().toString();
      if (fileStatus.isDirectory() || FileSystemUtil.isHiddenFile(fileName) ||
          HdfsCompression.fromFileName(fileName) == HdfsCompression.LZO_INDEX) {
        // Ignore directories, hidden files (starting with . or _), and LZO index files.
        // If a directory is erroneously created as a subdirectory of a partition dir,
        // we should ignore it and move on. Hive will not recurse into directories.
        // Skip index files, these are read by the LZO scanner directly.
        continue;
      }
      String fullPath = fileStatus.getPath().toString();
      FileDescriptor fd = (oldFileDescMap != null) ? oldFileDescMap.get(fullPath) : null;
      if (fd != null && fd.getFileLength() == fileStatus.getLen() &&
          fd.getModificationTime() == fileStatus.getModificationTime()) {
        // Reuse the old file descriptor along with its block metadata if the file
        // length and mtime have not changed.
      } else {
        // Create a new file descriptor. The block metadata will be populated by
        // loadBlockMd().
        fd = new FileDescriptor(fullPath, fileStatus.getLen(),
            fileStatus.getModificationTime());
        newFileDescs.add(fd);
      }
      fileDescriptors.add(fd);
      fileDescMap_.put(fullPath, fd);
    }

    HdfsPartition partition = new HdfsPartition(this, msPartition, partitionKeyExprs,
        fileFormatDescriptor, fileDescriptors, getAvailableAccessLevel(partDirPath));
    partitions_.add(partition);
    numHdfsFiles_ += fileDescriptors.size();
    totalHdfsBytes_ += partition.getSize();
    return partition;
  } else {
    LOG.warn("Path " + partDirPath + " does not exist for partition. Ignoring.");
    return null;
  }
}
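// Illustrative sketch only, not part of the original class: one way a metadata-loading
// routine might wire addPartition() and loadBlockMd() together, collecting the new or
// modified FileDescriptors from every partition and resolving their block metadata in a
// single pass. The method name loadPartitionsSketch() and the helper
// createPartitionKeyExprs() are hypothetical; the real loading code is not shown in this
// excerpt.
private void loadPartitionsSketch(
    List<org.apache.hadoop.hive.metastore.api.Partition> msPartitions,
    Map<String, FileDescriptor> oldFileDescMap)
    throws IOException, InvalidStorageDescriptorException {
  // File descriptors created (not reused) by addPartition(); their block metadata is
  // populated by loadBlockMd() below.
  List<FileDescriptor> newFileDescs = Lists.newArrayList();
  for (org.apache.hadoop.hive.metastore.api.Partition msPartition: msPartitions) {
    // Hypothetical helper that converts the partition's key values into LiteralExprs.
    List<LiteralExpr> partitionKeyExprs = createPartitionKeyExprs(msPartition);
    addPartition(msPartition.getSd(), msPartition, partitionKeyExprs, oldFileDescMap,
        newFileDescs);
  }
  if (!newFileDescs.isEmpty()) loadBlockMd(newFileDescs);
}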
/**
 * Populates file block metadata inside each file descriptor.
 */
private void loadBlockMd(List<FileDescriptor> fileDescriptors) throws RuntimeException {
  LOG.debug("load block md for " + name_);
  // Block locations for all the files.
  List<BlockLocation> blockLocations = Lists.newArrayList();
  // Loop over all files and record their block metadata, minus volume ids.
  for (FileDescriptor fileDescriptor: fileDescriptors) {
    Path p = new Path(fileDescriptor.getFilePath());
    BlockLocation[] locations = null;
    try {
      FileStatus fileStatus = DFS.getFileStatus(p);
      // fileDescriptors should not contain directories.
      Preconditions.checkArgument(!fileStatus.isDirectory());
      locations = DFS.getFileBlockLocations(fileStatus, 0, fileStatus.getLen());
      if (locations != null) {
        blockLocations.addAll(Arrays.asList(locations));
        for (int i = 0; i < locations.length; ++i) {
          FileBlock blockMd = new FileBlock(fileDescriptor.getFilePath(),
              fileDescriptor.getFileLength(), locations[i]);
          fileDescriptor.addFileBlock(blockMd);
        }
      }
    } catch (IOException e) {
      throw new RuntimeException("couldn't determine block locations for path '" + p +
          "':\n" + e.getMessage(), e);
    }
  }

  if (!SUPPORTS_VOLUME_ID) {
    return;
  }

  // BlockStorageLocations for all the blocks: the block described by
  // blockLocations.get(i) is located at locations[i].
  BlockStorageLocation[] locations = null;
  try {
    // Get the BlockStorageLocations for all the blocks.
    locations = DFS.getFileBlockStorageLocations(blockLocations);
  } catch (IOException e) {
    LOG.error("Couldn't determine block storage locations:\n" + e.getMessage());
    return;
  }

  if (locations == null || locations.length == 0) {
    LOG.warn("Attempted to get block locations but the call returned nulls");
    return;
  }

  if (locations.length != blockLocations.size()) {
    // The blocks and locations don't match up.
    LOG.error("Number of block locations not equal to number of blocks: " +
        "#locations=" + Long.toString(locations.length) +
        " #blocks=" + Long.toString(blockLocations.size()));
    return;
  }

  int locationsIdx = 0;
  int unknownDiskIdCount = 0;
  for (FileDescriptor fileDescriptor: fileDescriptors) {
    for (THdfsFileBlock blockMd: fileDescriptor.getFileBlocks()) {
      VolumeId[] volumeIds = locations[locationsIdx++].getVolumeIds();
      // Convert opaque VolumeIds to 0-based ids.
      // TODO: the diskId should eventually be retrievable from HDFS when
      // the community agrees this API is useful.
      int[] diskIds = new int[volumeIds.length];
      for (int i = 0; i < volumeIds.length; ++i) {
        diskIds[i] = getDiskId(volumeIds[i]);
        if (diskIds[i] < 0) ++unknownDiskIdCount;
      }
      FileBlock.setDiskIds(diskIds, blockMd);
    }
  }
  LOG.debug("loaded disk ids for table " + getFullName() + ". nodes: " + getNumNodes());
  if (unknownDiskIdCount > 0) {
    LOG.warn("unknown disk id count " + unknownDiskIdCount);
  }
}
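// Illustrative sketch only: getDiskId() is referenced above but not shown in this excerpt.
// One possible (assumed) way to convert an opaque VolumeId into a 0-based disk id is to
// assign indices in the order distinct volume ids are first seen, returning -1 for unknown
// ids. This is not necessarily how the real getDiskId() works; a production implementation
// would more likely derive the index from the volume id's encoded contents. The field
// diskIdIndexSketch_ and the method name getDiskIdSketch() are hypothetical, and
// com.google.common.collect.Maps is assumed to be imported.
private final Map<String, Integer> diskIdIndexSketch_ = Maps.newHashMap();

private int getDiskIdSketch(VolumeId volumeId) {
  // Unknown volume id; the caller above counts these via unknownDiskIdCount.
  if (volumeId == null) return -1;
  String key = volumeId.toString();
  Integer id = diskIdIndexSketch_.get(key);
  if (id == null) {
    // First time this volume id is seen: assign the next 0-based index.
    id = diskIdIndexSketch_.size();
    diskIdIndexSketch_.put(key, id);
  }
  return id;
}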