Example #1
0
 // This table has metadata set so the escape is \n, which is also the tuple delim. This
 // test validates that our representation of the catalog fixes this and removes the
 // escape char.
 @Test
 public void TestTableWithBadEscapeChar() throws TableLoadingException {
   HdfsTable table = (HdfsTable) catalog_.getDb("functional").getTable("escapechartesttable");
   List<HdfsPartition> partitions = table.getPartitions();
   for (HdfsPartition p : partitions) {
     HdfsStorageDescriptor desc = p.getInputFormatDescriptor();
     // Every partition must have fallen back to the default escape char.
     // Note: assertEquals takes (expected, actual) — the original call had the
     // arguments swapped, which yields misleading failure messages.
     assertEquals(HdfsStorageDescriptor.DEFAULT_ESCAPE_CHAR, desc.getEscapeChar());
   }
 }
Example #2
0
 /**
  * Appends the synthetic "default" partition to the internal partition list. The default
  * partition owns no files and is never referenced by scan nodes; data sinks consult it to
  * learn how new partitions should be created.
  *
  * @param storageDescriptor metastore storage descriptor to derive the file format from
  * @throws InvalidStorageDescriptorException if the descriptor contains metadata Impala
  *     cannot parse
  */
 private void addDefaultPartition(StorageDescriptor storageDescriptor)
     throws InvalidStorageDescriptorException {
   HdfsStorageDescriptor formatDescriptor =
       HdfsStorageDescriptor.fromStorageDescriptor(this.name_, storageDescriptor);
   partitions_.add(HdfsPartition.defaultPartition(this, formatDescriptor));
 }
Example #3
0
  /**
   * Converts this partition to its Thrift representation, serializing the text-format
   * delimiters, file format, partition-key expressions, block size and compression.
   */
  public THdfsPartition toThrift() {
    // Convert the partition-key literals to Thrift expressions first, preserving the
    // original evaluation order.
    List<TExpr> keyExprs = Expr.treesToThrift(getPartitionValues());
    HdfsStorageDescriptor fmt = fileFormatDescriptor;
    return new THdfsPartition(
        (byte) fmt.getLineDelim(),
        (byte) fmt.getFieldDelim(),
        (byte) fmt.getCollectionDelim(),
        (byte) fmt.getMapKeyDelim(),
        (byte) fmt.getEscapeChar(),
        fmt.getFileFormat().toThrift(),
        keyExprs,
        fmt.getBlockSize(),
        fmt.getCompression());
  }
  /**
   * Tests that Impala is able to create an HdfsStorageDescriptor using all combinations of Parquet
   * SerDe class name + input/output format class name.
   */
  @Test
  public void testParquetFileFormat()
      throws DatabaseNotFoundException, InvalidStorageDescriptorException {
    final String[] serDeLibs = {
      "org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe",
      "parquet.hive.serde.ParquetHiveSerDe"
    };
    final String[] inputFormatClasses = {
      "com.cloudera.impala.hive.serde.ParquetInputFormat",
      "parquet.hive.DeprecatedParquetInputFormat",
      "parquet.hive.MapredParquetInputFormat",
      "org.apache.hadoop.hive.ql.io.parquet.MapredParquetInputFormat"
    };
    final String[] outputFormatClasses = {
      "com.cloudera.impala.hive.serde.ParquetOutputFormat",
      "parquet.hive.DeprecatedParquetOutputFormat",
      "parquet.hive.MapredParquetOutputFormat",
      "org.apache.hadoop.hive.ql.io.parquet.MapredParquetOutputFormat"
    };

    for (String serDeLib : serDeLibs) {
      // The SerDeInfo is shared across all descriptors built for this SerDe, as only
      // its class names are inspected by fromStorageDescriptor().
      SerDeInfo serDeInfo = new SerDeInfo();
      serDeInfo.setSerializationLib(serDeLib);
      serDeInfo.setParameters(new HashMap<String, String>());
      for (String inputFormatClass : inputFormatClasses) {
        for (String outputFormatClass : outputFormatClasses) {
          StorageDescriptor sd = new StorageDescriptor();
          sd.setSerdeInfo(serDeInfo);
          sd.setInputFormat(inputFormatClass);
          sd.setOutputFormat(outputFormatClass);
          // Every SerDe/input-format/output-format combination must parse successfully.
          assertNotNull(HdfsStorageDescriptor.fromStorageDescriptor("fakeTblName", sd));
        }
      }
    }
  }
Example #5
0
  /**
   * Adds a new HdfsPartition to the internal partition list, populating it with file format
   * information and file locations. For unchanged files (same length and mtime), reuses the
   * FileDescriptor from oldFileDescMap; otherwise creates a new FileDescriptor for each
   * modified or new file and adds it to newFileDescs. If the partition directory does not
   * exist, nothing is added and null is returned; an existing but empty directory still
   * produces a (file-less) partition.
   *
   * <p>Side effects: appends to partitions_, updates fileDescMap_, and increments
   * numHdfsFiles_ and totalHdfsBytes_ when a partition is added.
   *
   * @param storageDescriptor metastore descriptor providing the file format and location
   * @param msPartition the underlying metastore partition object
   * @param partitionKeyExprs literal values of this partition's key columns
   * @param oldFileDescMap full file path to previously-loaded FileDescriptor; may be null
   * @param newFileDescs output list collecting descriptors of new or modified files
   * @return the added partition, or null if the partition directory does not exist
   * @throws InvalidStorageDescriptorException if the supplied storage descriptor contains metadata
   *     that Impala can't understand.
   * @throws IOException if listing the partition directory fails
   */
  private HdfsPartition addPartition(
      StorageDescriptor storageDescriptor,
      org.apache.hadoop.hive.metastore.api.Partition msPartition,
      List<LiteralExpr> partitionKeyExprs,
      Map<String, FileDescriptor> oldFileDescMap,
      List<FileDescriptor> newFileDescs)
      throws IOException, InvalidStorageDescriptorException {
    HdfsStorageDescriptor fileFormatDescriptor =
        HdfsStorageDescriptor.fromStorageDescriptor(this.name_, storageDescriptor);
    Path partDirPath = new Path(storageDescriptor.getLocation());
    List<FileDescriptor> fileDescriptors = Lists.newArrayList();
    if (DFS.exists(partDirPath)) {
      // DistributedFilesystem does not have an API that takes in a timestamp and return
      // a list of files that has been added/changed since. Therefore, we are calling
      // DFS.listStatus() to list all the files.
      for (FileStatus fileStatus : DFS.listStatus(partDirPath)) {
        String fileName = fileStatus.getPath().getName().toString();
        if (fileStatus.isDirectory()
            || FileSystemUtil.isHiddenFile(fileName)
            || HdfsCompression.fromFileName(fileName) == HdfsCompression.LZO_INDEX) {
          // Ignore directory, hidden file starting with . or _, and LZO index files
          // If a directory is erroneously created as a subdirectory of a partition dir
          // we should ignore it and move on. Hive will not recurse into directories.
          // Skip index files, these are read by the LZO scanner directly.
          continue;
        }

        String fullPath = fileStatus.getPath().toString();
        FileDescriptor fd = (oldFileDescMap != null) ? oldFileDescMap.get(fullPath) : null;
        if (fd != null
            && fd.getFileLength() == fileStatus.getLen()
            && fd.getModificationTime() == fileStatus.getModificationTime()) {
          // Reuse the old file descriptor along with its block metadata if the file
          // length and mtime has not been changed.
        } else {
          // Create a new file descriptor. The block metadata will be populated by
          // loadFileDescriptorsBlockMd.
          fd = new FileDescriptor(fullPath, fileStatus.getLen(), fileStatus.getModificationTime());
          newFileDescs.add(fd);
        }
        fileDescriptors.add(fd);
        // Record the (possibly reused) descriptor in the table-wide map keyed by full path.
        fileDescMap_.put(fullPath, fd);
      }

      HdfsPartition partition =
          new HdfsPartition(
              this,
              msPartition,
              partitionKeyExprs,
              fileFormatDescriptor,
              fileDescriptors,
              getAvailableAccessLevel(partDirPath));
      partitions_.add(partition);
      // Keep the table-wide file count and byte totals in sync with the new partition.
      numHdfsFiles_ += fileDescriptors.size();
      totalHdfsBytes_ += partition.getSize();
      return partition;
    } else {
      LOG.warn("Path " + partDirPath + " does not exist for partition. Ignoring.");
      return null;
    }
  }
  /**
   * Verifies Impala is able to properly parse delimiters in supported formats. See
   * HdfsStorageDescriptor.parseDelim() for details.
   */
  @Test
  public void testDelimiters() throws InvalidStorageDescriptorException {
    StorageDescriptor sd =
        HiveStorageDescriptorFactory.createSd(THdfsFileFormat.TEXT, RowFormat.DEFAULT_ROW_FORMAT);
    sd.setParameters(new HashMap<String, String>());

    // Valid delimiters: decimal values in [-128, 127] and single characters.
    assertValidDelimiter(sd, serdeConstants.FIELD_DELIM, "-2");
    assertValidDelimiter(sd, serdeConstants.FIELD_DELIM, "-128");
    assertValidDelimiter(sd, serdeConstants.FIELD_DELIM, "127");
    assertValidDelimiter(sd, serdeConstants.LINE_DELIM, "\001");
    assertValidDelimiter(sd, serdeConstants.FIELD_DELIM, "|");
    assertValidDelimiter(sd, serdeConstants.FIELD_DELIM, "\t");

    // Invalid delimiters: multi-character strings and decimal values outside [-128, 127].
    assertInvalidDelimiter(sd, serdeConstants.FIELD_DELIM, "ab");
    assertInvalidDelimiter(sd, serdeConstants.FIELD_DELIM, "128");
    assertInvalidDelimiter(sd, serdeConstants.FIELD_DELIM, "\128");
    assertInvalidDelimiter(sd, serdeConstants.LINE_DELIM, "-129");
  }

  /**
   * Sets the given delimiter parameter on the descriptor's SerDe info (clearing any previous
   * parameters) and expects parsing to succeed.
   */
  private static void assertValidDelimiter(
      StorageDescriptor sd, String delimKey, String delimValue)
      throws InvalidStorageDescriptorException {
    sd.getSerdeInfo().setParameters(new HashMap<String, String>());
    sd.getSerdeInfo().putToParameters(delimKey, delimValue);
    assertNotNull(HdfsStorageDescriptor.fromStorageDescriptor("fakeTbl", sd));
  }

  /**
   * Sets the given delimiter parameter on the descriptor's SerDe info (clearing any previous
   * parameters) and expects an InvalidStorageDescriptorException with the standard message.
   */
  private static void assertInvalidDelimiter(
      StorageDescriptor sd, String delimKey, String delimValue) {
    sd.getSerdeInfo().setParameters(new HashMap<String, String>());
    sd.getSerdeInfo().putToParameters(delimKey, delimValue);
    try {
      HdfsStorageDescriptor.fromStorageDescriptor("fake", sd);
      fail();
    } catch (HdfsStorageDescriptor.InvalidStorageDescriptorException e) {
      assertEquals(
          "Invalid delimiter: '" + delimValue + "'. Delimiter must be specified as a "
              + "single character or as a decimal value in the range [-128:127]",
          e.getMessage());
    }
  }