// This table has metadata set so the escape is \n, which is also the tuple delim.
// This test validates that our representation of the catalog fixes this and
// removes the escape char.
@Test
public void TestTableWithBadEscapeChar() throws TableLoadingException {
  HdfsTable table =
      (HdfsTable) catalog_.getDb("functional").getTable("escapechartesttable");
  List<HdfsPartition> partitions = table.getPartitions();
  for (HdfsPartition p : partitions) {
    HdfsStorageDescriptor desc = p.getInputFormatDescriptor();
    assertEquals(desc.getEscapeChar(), HdfsStorageDescriptor.DEFAULT_ESCAPE_CHAR);
  }
}
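// A minimal sketch of the sanitization the test above relies on; the helper name
// sanitizeEscapeChar is hypothetical and not part of the real catalog code. The
// idea: if the configured escape character collides with a delimiter, escaping
// would be ambiguous, so the catalog falls back to the default escape char.
static char sanitizeEscapeChar(char escapeChar, char lineDelim, char fieldDelim) {
  if (escapeChar == lineDelim || escapeChar == fieldDelim) {
    // Collision: discard the user-supplied escape char.
    return HdfsStorageDescriptor.DEFAULT_ESCAPE_CHAR;
  }
  return escapeChar;
}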
private void addDefaultPartition(StorageDescriptor storageDescriptor)
    throws InvalidStorageDescriptorException {
  // The default partition has no files and is not referred to by scan nodes.
  // Data sinks refer to this to understand how to create new partitions.
  HdfsStorageDescriptor hdfsStorageDescriptor =
      HdfsStorageDescriptor.fromStorageDescriptor(this.name_, storageDescriptor);
  HdfsPartition partition = HdfsPartition.defaultPartition(this, hdfsStorageDescriptor);
  partitions_.add(partition);
}
public THdfsPartition toThrift() {
  List<TExpr> thriftExprs = Expr.treesToThrift(getPartitionValues());
  return new THdfsPartition(
      (byte) fileFormatDescriptor.getLineDelim(),
      (byte) fileFormatDescriptor.getFieldDelim(),
      (byte) fileFormatDescriptor.getCollectionDelim(),
      (byte) fileFormatDescriptor.getMapKeyDelim(),
      (byte) fileFormatDescriptor.getEscapeChar(),
      fileFormatDescriptor.getFileFormat().toThrift(),
      thriftExprs,
      fileFormatDescriptor.getBlockSize(),
      fileFormatDescriptor.getCompression());
}
/**
 * Tests that Impala is able to create an HdfsStorageDescriptor using all
 * combinations of Parquet SerDe class name + input/output format class name.
 */
@Test
public void testParquetFileFormat()
    throws DatabaseNotFoundException, InvalidStorageDescriptorException {
  String[] parquetSerDe = new String[] {
      "org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe",
      "parquet.hive.serde.ParquetHiveSerDe"};
  String[] inputFormats = new String[] {
      "com.cloudera.impala.hive.serde.ParquetInputFormat",
      "parquet.hive.DeprecatedParquetInputFormat",
      "parquet.hive.MapredParquetInputFormat",
      "org.apache.hadoop.hive.ql.io.parquet.MapredParquetInputFormat"};
  String[] outputFormats = new String[] {
      "com.cloudera.impala.hive.serde.ParquetOutputFormat",
      "parquet.hive.DeprecatedParquetOutputFormat",
      "parquet.hive.MapredParquetOutputFormat",
      "org.apache.hadoop.hive.ql.io.parquet.MapredParquetOutputFormat"};

  for (String serDe : parquetSerDe) {
    SerDeInfo serDeInfo = new SerDeInfo();
    serDeInfo.setSerializationLib(serDe);
    serDeInfo.setParameters(new HashMap<String, String>());
    for (String inputFormat : inputFormats) {
      for (String outputFormat : outputFormats) {
        StorageDescriptor sd = new StorageDescriptor();
        sd.setSerdeInfo(serDeInfo);
        sd.setInputFormat(inputFormat);
        sd.setOutputFormat(outputFormat);
        assertNotNull(HdfsStorageDescriptor.fromStorageDescriptor("fakeTblName", sd));
      }
    }
  }
}
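// A hypothetical sketch (not Impala's actual implementation) of why every
// combination above is accepted: file format detection can key off the input
// format class name alone, with all known Parquet variants mapping to the same
// file format. Assumes Guava's ImmutableSet (import
// com.google.common.collect.ImmutableSet); the set contents are copied verbatim
// from the test above.
private static final ImmutableSet<String> PARQUET_INPUT_FORMATS = ImmutableSet.of(
    "com.cloudera.impala.hive.serde.ParquetInputFormat",
    "parquet.hive.DeprecatedParquetInputFormat",
    "parquet.hive.MapredParquetInputFormat",
    "org.apache.hadoop.hive.ql.io.parquet.MapredParquetInputFormat");

private static boolean isParquetInputFormat(String inputFormatClass) {
  return PARQUET_INPUT_FORMATS.contains(inputFormatClass);
}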
/**
 * Adds a new HdfsPartition to the internal partition list, populating it with file
 * format information and file locations. If a partition contains no files, it is
 * not added. For unchanged files (indicated by an unchanged mtime), reuses the
 * FileDescriptor from oldFileDescMap. Otherwise, creates a new FileDescriptor for
 * each modified or new file and adds it to newFileDescs. Returns the new partition,
 * or null if none was added.
 *
 * @throws InvalidStorageDescriptorException if the supplied storage descriptor
 *         contains metadata that Impala can't understand.
 */
private HdfsPartition addPartition(StorageDescriptor storageDescriptor,
    org.apache.hadoop.hive.metastore.api.Partition msPartition,
    List<LiteralExpr> partitionKeyExprs,
    Map<String, FileDescriptor> oldFileDescMap,
    List<FileDescriptor> newFileDescs)
    throws IOException, InvalidStorageDescriptorException {
  HdfsStorageDescriptor fileFormatDescriptor =
      HdfsStorageDescriptor.fromStorageDescriptor(this.name_, storageDescriptor);
  Path partDirPath = new Path(storageDescriptor.getLocation());
  List<FileDescriptor> fileDescriptors = Lists.newArrayList();
  if (DFS.exists(partDirPath)) {
    // DistributedFileSystem does not have an API that takes in a timestamp and
    // returns a list of files that have been added/changed since. Therefore, we
    // call DFS.listStatus() to list all the files.
    for (FileStatus fileStatus : DFS.listStatus(partDirPath)) {
      String fileName = fileStatus.getPath().getName().toString();
      if (fileStatus.isDirectory() || FileSystemUtil.isHiddenFile(fileName) ||
          HdfsCompression.fromFileName(fileName) == HdfsCompression.LZO_INDEX) {
        // Ignore directories, hidden files (starting with . or _), and LZO index
        // files. If a directory is erroneously created as a subdirectory of a
        // partition dir, we should ignore it and move on; Hive will not recurse
        // into directories. Skip index files: these are read by the LZO scanner
        // directly.
        continue;
      }
      String fullPath = fileStatus.getPath().toString();
      FileDescriptor fd = (oldFileDescMap != null) ? oldFileDescMap.get(fullPath) : null;
      if (fd != null && fd.getFileLength() == fileStatus.getLen() &&
          fd.getModificationTime() == fileStatus.getModificationTime()) {
        // Reuse the old file descriptor along with its block metadata if neither
        // the file length nor the mtime has changed.
      } else {
        // Create a new file descriptor. The block metadata will be populated by
        // loadFileDescriptorsBlockMd.
        fd = new FileDescriptor(fullPath, fileStatus.getLen(),
            fileStatus.getModificationTime());
        newFileDescs.add(fd);
      }
      fileDescriptors.add(fd);
      fileDescMap_.put(fullPath, fd);
    }
    HdfsPartition partition = new HdfsPartition(this, msPartition, partitionKeyExprs,
        fileFormatDescriptor, fileDescriptors, getAvailableAccessLevel(partDirPath));
    partitions_.add(partition);
    numHdfsFiles_ += fileDescriptors.size();
    totalHdfsBytes_ += partition.getSize();
    return partition;
  } else {
    LOG.warn("Path " + partDirPath + " does not exist for partition. Ignoring.");
    return null;
  }
}
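// A minimal sketch isolating the reuse rule from addPartition() above; the helper
// name canReuseFileDescriptor is hypothetical. A cached FileDescriptor stays valid
// only if both the file length and the modification time are unchanged, so stale
// block metadata is never carried over for a rewritten file.
private static boolean canReuseFileDescriptor(FileDescriptor fd, FileStatus status) {
  return fd != null
      && fd.getFileLength() == status.getLen()
      && fd.getModificationTime() == status.getModificationTime();
}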
/**
 * Verifies Impala is able to properly parse delimiters in supported formats.
 * See HdfsStorageDescriptor.parseDelim() for details.
 */
@Test
public void testDelimiters() throws InvalidStorageDescriptorException {
  StorageDescriptor sd = HiveStorageDescriptorFactory.createSd(
      THdfsFileFormat.TEXT, RowFormat.DEFAULT_ROW_FORMAT);
  sd.setParameters(new HashMap<String, String>());

  sd.getSerdeInfo().setParameters(new HashMap<String, String>());
  sd.getSerdeInfo().putToParameters(serdeConstants.FIELD_DELIM, "-2");
  assertNotNull(HdfsStorageDescriptor.fromStorageDescriptor("fakeTbl", sd));

  sd.getSerdeInfo().setParameters(new HashMap<String, String>());
  sd.getSerdeInfo().putToParameters(serdeConstants.FIELD_DELIM, "-128");
  assertNotNull(HdfsStorageDescriptor.fromStorageDescriptor("fakeTbl", sd));

  sd.getSerdeInfo().setParameters(new HashMap<String, String>());
  sd.getSerdeInfo().putToParameters(serdeConstants.FIELD_DELIM, "127");
  assertNotNull(HdfsStorageDescriptor.fromStorageDescriptor("fakeTbl", sd));

  sd.getSerdeInfo().setParameters(new HashMap<String, String>());
  sd.getSerdeInfo().putToParameters(serdeConstants.LINE_DELIM, "\001");
  assertNotNull(HdfsStorageDescriptor.fromStorageDescriptor("fakeTbl", sd));

  sd.getSerdeInfo().setParameters(new HashMap<String, String>());
  sd.getSerdeInfo().putToParameters(serdeConstants.FIELD_DELIM, "|");
  assertNotNull(HdfsStorageDescriptor.fromStorageDescriptor("fakeTbl", sd));

  sd.getSerdeInfo().setParameters(new HashMap<String, String>());
  sd.getSerdeInfo().putToParameters(serdeConstants.FIELD_DELIM, "\t");
  assertNotNull(HdfsStorageDescriptor.fromStorageDescriptor("fakeTbl", sd));

  sd.getSerdeInfo().setParameters(new HashMap<String, String>());
  sd.getSerdeInfo().putToParameters(serdeConstants.FIELD_DELIM, "ab");
  try {
    HdfsStorageDescriptor.fromStorageDescriptor("fake", sd);
    fail();
  } catch (HdfsStorageDescriptor.InvalidStorageDescriptorException e) {
    assertEquals("Invalid delimiter: 'ab'. Delimiter must be specified as a " +
        "single character or as a decimal value in the range [-128:127]",
        e.getMessage());
  }

  sd.getSerdeInfo().setParameters(new HashMap<String, String>());
  sd.getSerdeInfo().putToParameters(serdeConstants.FIELD_DELIM, "128");
  try {
    HdfsStorageDescriptor.fromStorageDescriptor("fake", sd);
    fail();
  } catch (HdfsStorageDescriptor.InvalidStorageDescriptorException e) {
    assertEquals("Invalid delimiter: '128'. Delimiter must be specified as a " +
        "single character or as a decimal value in the range [-128:127]",
        e.getMessage());
  }

  sd.getSerdeInfo().setParameters(new HashMap<String, String>());
  sd.getSerdeInfo().putToParameters(serdeConstants.FIELD_DELIM, "\128");
  try {
    HdfsStorageDescriptor.fromStorageDescriptor("fake", sd);
    fail();
  } catch (HdfsStorageDescriptor.InvalidStorageDescriptorException e) {
    assertEquals("Invalid delimiter: '\128'. Delimiter must be specified as a " +
        "single character or as a decimal value in the range [-128:127]",
        e.getMessage());
  }

  sd.getSerdeInfo().setParameters(new HashMap<String, String>());
  sd.getSerdeInfo().putToParameters(serdeConstants.LINE_DELIM, "-129");
  try {
    HdfsStorageDescriptor.fromStorageDescriptor("fake", sd);
    fail();
  } catch (HdfsStorageDescriptor.InvalidStorageDescriptorException e) {
    assertEquals("Invalid delimiter: '-129'. Delimiter must be specified as a " +
        "single character or as a decimal value in the range [-128:127]",
        e.getMessage());
  }
}
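// A hedged sketch of the parsing rule the tests above exercise; the real
// HdfsStorageDescriptor.parseDelim() may differ in detail. Per the asserted error
// messages, a delimiter is either a single character (used directly) or a decimal
// value in the range [-128:127]; anything else is rejected.
private static Byte parseDelimSketch(String delimiter) {
  if (delimiter.length() == 1) {
    // Single character: use its code point as the delimiter byte.
    return (byte) delimiter.charAt(0);
  }
  try {
    // Multi-character strings must parse as a decimal byte in [-128:127].
    return Byte.parseByte(delimiter);
  } catch (NumberFormatException e) {
    // Invalid delimiter: in the real code the caller raises
    // InvalidStorageDescriptorException with the message asserted above.
    return null;
  }
}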