Example #1
 /**
  * Counts the number of unique data node network addresses for all file blocks in the given
  * partitions.
  */
 private static int countUniqueDataNetworkLocations(List<HdfsPartition> partitions) {
   Set<TNetworkAddress> uniqueHostPorts = Sets.newHashSet();
   for (HdfsPartition partition : partitions) {
     for (FileDescriptor fileDesc : partition.getFileDescriptors()) {
       for (THdfsFileBlock blockMd : fileDesc.getFileBlocks()) {
         for (TNetworkAddress networkAddress : blockMd.getNetwork_addresses()) {
           uniqueHostPorts.add(networkAddress);
         }
       }
     }
   }
   return uniqueHostPorts.size();
 }
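
A minimal standalone sketch of the same deduplication idea, using only standard Java collections. The plain "host:port" strings and the UniqueLocationCounter class below are hypothetical stand-ins for TNetworkAddress and the catalog classes, not the real types:

import java.util.Arrays;
import java.util.HashSet;
import java.util.List;
import java.util.Set;

class UniqueLocationCounter {
  // Each inner list holds the replica addresses ("host:port") of one file block.
  static int countUniqueLocations(List<List<String>> blockReplicaLists) {
    Set<String> uniqueHostPorts = new HashSet<>();
    for (List<String> replicas : blockReplicaLists) {
      uniqueHostPorts.addAll(replicas);
    }
    return uniqueHostPorts.size();
  }

  public static void main(String[] args) {
    List<List<String>> blocks = Arrays.asList(
        Arrays.asList("dn1:50010", "dn2:50010", "dn3:50010"),
        Arrays.asList("dn2:50010", "dn3:50010", "dn4:50010"));
    System.out.println(countUniqueLocations(blocks)); // prints 4
  }
}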
Example #2
  /**
   * Adds a new HdfsPartition to the internal partition list, populating it with file format
   * information and file locations. If the partition's directory does not exist, no partition is
   * added. For unchanged files (indicated by an unchanged mtime), the FileDescriptor from
   * oldFileDescMap is reused. Otherwise, a new FileDescriptor is created for each modified or new
   * file and added to newFileDescs. Returns the new partition, or null if none was added.
   *
   * @throws InvalidStorageDescriptorException if the supplied storage descriptor contains metadata
   *     that Impala can't understand.
   */
  private HdfsPartition addPartition(
      StorageDescriptor storageDescriptor,
      org.apache.hadoop.hive.metastore.api.Partition msPartition,
      List<LiteralExpr> partitionKeyExprs,
      Map<String, FileDescriptor> oldFileDescMap,
      List<FileDescriptor> newFileDescs)
      throws IOException, InvalidStorageDescriptorException {
    HdfsStorageDescriptor fileFormatDescriptor =
        HdfsStorageDescriptor.fromStorageDescriptor(this.name_, storageDescriptor);
    Path partDirPath = new Path(storageDescriptor.getLocation());
    List<FileDescriptor> fileDescriptors = Lists.newArrayList();
    if (DFS.exists(partDirPath)) {
      // DistributedFileSystem does not have an API that takes a timestamp and returns the
      // list of files that have been added or changed since then. Therefore, we call
      // DFS.listStatus() to list all the files.
      for (FileStatus fileStatus : DFS.listStatus(partDirPath)) {
        String fileName = fileStatus.getPath().getName();
        if (fileStatus.isDirectory()
            || FileSystemUtil.isHiddenFile(fileName)
            || HdfsCompression.fromFileName(fileName) == HdfsCompression.LZO_INDEX) {
          // Ignore directories, hidden files (names starting with . or _), and LZO index files.
          // If a directory is erroneously created as a subdirectory of a partition dir, we
          // should ignore it and move on; Hive will not recurse into directories. LZO index
          // files are skipped because they are read directly by the LZO scanner.
          continue;
        }

        String fullPath = fileStatus.getPath().toString();
        FileDescriptor fd = (oldFileDescMap != null) ? oldFileDescMap.get(fullPath) : null;
        if (fd != null
            && fd.getFileLength() == fileStatus.getLen()
            && fd.getModificationTime() == fileStatus.getModificationTime()) {
          // Reuse the old file descriptor along with its block metadata if the file
          // length and mtime have not changed.
        } else {
          // Create a new file descriptor. The block metadata will be populated by
          // loadFileDescriptorsBlockMd.
          fd = new FileDescriptor(fullPath, fileStatus.getLen(), fileStatus.getModificationTime());
          newFileDescs.add(fd);
        }
        fileDescriptors.add(fd);
        fileDescMap_.put(fullPath, fd);
      }

      HdfsPartition partition =
          new HdfsPartition(
              this,
              msPartition,
              partitionKeyExprs,
              fileFormatDescriptor,
              fileDescriptors,
              getAvailableAccessLevel(partDirPath));
      partitions_.add(partition);
      numHdfsFiles_ += fileDescriptors.size();
      totalHdfsBytes_ += partition.getSize();
      return partition;
    } else {
      LOG.warn("Path " + partDirPath + " does not exist for partition. Ignoring.");
      return null;
    }
  }
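
The reuse-on-unchanged-length-and-mtime pattern in addPartition() can be illustrated in isolation with java.nio.file. FileMeta, FileMetaCache, oldCache, and newEntries below are hypothetical simplifications of FileDescriptor, oldFileDescMap, and newFileDescs, not the real catalog classes:

import java.io.IOException;
import java.nio.file.DirectoryStream;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.attribute.BasicFileAttributes;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;

class FileMetaCache {
  // Hypothetical simplification of FileDescriptor: just path, length, and mtime.
  static final class FileMeta {
    final String path;
    final long length;
    final long mtime;

    FileMeta(String path, long length, long mtime) {
      this.path = path;
      this.length = length;
      this.mtime = mtime;
    }
  }

  // Walks 'dir', reusing cached entries whose length and mtime are unchanged and
  // creating new entries otherwise, mirroring the reuse logic in addPartition().
  static List<FileMeta> refresh(Path dir, Map<String, FileMeta> oldCache,
      List<FileMeta> newEntries) throws IOException {
    List<FileMeta> result = new ArrayList<>();
    try (DirectoryStream<Path> stream = Files.newDirectoryStream(dir)) {
      for (Path p : stream) {
        BasicFileAttributes attrs = Files.readAttributes(p, BasicFileAttributes.class);
        if (attrs.isDirectory()) continue; // skip subdirectories, as above
        String key = p.toString();
        FileMeta cached = (oldCache != null) ? oldCache.get(key) : null;
        if (cached != null
            && cached.length == attrs.size()
            && cached.mtime == attrs.lastModifiedTime().toMillis()) {
          result.add(cached); // unchanged: reuse the old metadata
        } else {
          FileMeta fresh = new FileMeta(key, attrs.size(), attrs.lastModifiedTime().toMillis());
          newEntries.add(fresh);
          result.add(fresh);
        }
      }
    }
    return result;
  }
}

The same idea carries over to the catalog case: only the entries collected in newEntries need their (expensive) block metadata reloaded afterwards.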
Example #3
  /** Populates the file block metadata inside each file descriptor. */
  private void loadBlockMd(List<FileDescriptor> fileDescriptors) throws RuntimeException {
    LOG.debug("load block md for " + name_);
    // Block locations for all the files
    List<BlockLocation> blockLocations = Lists.newArrayList();

    // loop over all files and record their block metadata, minus volume ids
    for (FileDescriptor fileDescriptor : fileDescriptors) {
      Path p = new Path(fileDescriptor.getFilePath());
      BlockLocation[] locations = null;
      try {
        FileStatus fileStatus = DFS.getFileStatus(p);
        // fileDescriptors should not contain directories.
        Preconditions.checkArgument(!fileStatus.isDirectory());
        locations = DFS.getFileBlockLocations(fileStatus, 0, fileStatus.getLen());
        if (locations != null) {
          blockLocations.addAll(Arrays.asList(locations));
          for (int i = 0; i < locations.length; ++i) {
            FileBlock blockMd =
                new FileBlock(
                    fileDescriptor.getFilePath(), fileDescriptor.getFileLength(), locations[i]);
            fileDescriptor.addFileBlock(blockMd);
          }
        }
      } catch (IOException e) {
        throw new RuntimeException(
            "couldn't determine block locations for path '" + p + "':\n" + e.getMessage(), e);
      }
    }

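    // Nothing more to do if volume ids (disk ids) are not supported or enabled.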
    if (!SUPPORTS_VOLUME_ID) {
      return;
    }

    // BlockStorageLocations for all the blocks:
    // locations[i] describes the storage volumes of the block in blockLocations.get(i).
    BlockStorageLocation[] locations = null;
    try {
      // Get the BlockStorageLocations for all the blocks
      locations = DFS.getFileBlockStorageLocations(blockLocations);
    } catch (IOException e) {
      LOG.error("Couldn't determine block storage locations:\n" + e.getMessage());
      return;
    }

    if (locations == null || locations.length == 0) {
      LOG.warn("Attempted to get block locations but the call returned nulls");
      return;
    }

    if (locations.length != blockLocations.size()) {
      // blocks and locations don't match up
      LOG.error(
          "Number of block storage locations does not match the number of blocks: "
              + "#locations=" + locations.length
              + " #blocks=" + blockLocations.size());
      return;
    }

    int locationsIdx = 0;
    int unknownDiskIdCount = 0;
    for (FileDescriptor fileDescriptor : fileDescriptors) {
      for (THdfsFileBlock blockMd : fileDescriptor.getFileBlocks()) {
        VolumeId[] volumeIds = locations[locationsIdx++].getVolumeIds();
        // Convert opaque VolumeId to 0 based ids.
        // TODO: the diskId should be eventually retrievable from Hdfs when
        // the community agrees this API is useful.
        int[] diskIds = new int[volumeIds.length];
        for (int i = 0; i < volumeIds.length; ++i) {
          diskIds[i] = getDiskId(volumeIds[i]);
          if (diskIds[i] < 0) ++unknownDiskIdCount;
        }
        FileBlock.setDiskIds(diskIds, blockMd);
      }
    }
    LOG.debug("loaded disk ids for table " + getFullName() + ". nodes: " + getNumNodes());
    if (unknownDiskIdCount > 0) {
      LOG.warn("unknown disk id count " + unknownDiskIdCount);
    }
  }
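
A hedged standalone sketch of the block-location lookup using only the public Hadoop FileSystem API. The class name, directory path, and Configuration are placeholders; the volume-id to disk-id translation from loadBlockMd() is omitted because BlockStorageLocation/VolumeId are DistributedFileSystem-specific:

import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.BlockLocation;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

class BlockLocationLoader {
  // Lists every file directly under 'dir' and records the locations of each of its
  // blocks, mirroring the first loop of loadBlockMd() above.
  static Map<String, List<BlockLocation>> loadBlockLocations(FileSystem fs, Path dir)
      throws IOException {
    Map<String, List<BlockLocation>> result = new HashMap<>();
    for (FileStatus status : fs.listStatus(dir)) {
      if (status.isDirectory()) continue; // block metadata only applies to files
      BlockLocation[] locations = fs.getFileBlockLocations(status, 0, status.getLen());
      List<BlockLocation> blocks =
          (locations == null) ? new ArrayList<BlockLocation>() : Arrays.asList(locations);
      result.put(status.getPath().toString(), blocks);
    }
    return result;
  }

  public static void main(String[] args) throws IOException {
    // Placeholder path; in practice this would be a partition directory in HDFS.
    FileSystem fs = FileSystem.get(new Configuration());
    Map<String, List<BlockLocation>> blocks = loadBlockLocations(fs, new Path("/tmp"));
    blocks.forEach(
        (file, locs) -> System.out.println(file + ": " + locs.size() + " block(s)"));
  }
}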