static GlobalMetaData mergeInto(
        FileMetaData toMerge,
        GlobalMetaData mergedMetadata,
        boolean strict) {
    MessageType schema = null;
    Map<String, Set<String>> newKeyValues = new HashMap<String, Set<String>>();
    Set<String> createdBy = new HashSet<String>();
    if (mergedMetadata != null) {
        schema = mergedMetadata.getSchema();
        newKeyValues.putAll(mergedMetadata.getKeyValueMetaData());
        createdBy.addAll(mergedMetadata.getCreatedBy());
    }
    // merge schemas only when the incoming file schema differs from what has been accumulated so far
    if ((schema == null && toMerge.getSchema() != null)
            || (schema != null && !schema.equals(toMerge.getSchema()))) {
        schema = mergeInto(toMerge.getSchema(), schema, strict);
    }
    // accumulate every distinct value seen for each key-value metadata key
    for (Entry<String, String> entry : toMerge.getKeyValueMetaData().entrySet()) {
        Set<String> values = newKeyValues.get(entry.getKey());
        if (values == null) {
            values = new HashSet<String>();
            newKeyValues.put(entry.getKey(), values);
        }
        values.add(entry.getValue());
    }
    createdBy.add(toMerge.getCreatedBy());
    return new GlobalMetaData(schema, newKeyValues, createdBy);
}
public static parquet.schema.Type getParquetType(
        HiveColumnHandle column,
        MessageType messageType,
        boolean useParquetColumnNames) {
    if (useParquetColumnNames) {
        return getParquetTypeByName(column.getName(), messageType);
    }
    if (column.getHiveColumnIndex() < messageType.getFieldCount()) {
        return messageType.getType(column.getHiveColumnIndex());
    }
    return null;
}
private parquet.schema.Type getParquetType(HiveColumnHandle column, MessageType messageType) {
    if (useParquetColumnNames) {
        if (messageType.containsField(column.getName())) {
            return messageType.getType(column.getName());
        }
        return null;
    }
    if (column.getHiveColumnIndex() < messageType.getFieldCount()) {
        return messageType.getType(column.getHiveColumnIndex());
    }
    return null;
}
private static parquet.schema.Type getParquetTypeByName(String columnName, MessageType messageType) {
    if (messageType.containsField(columnName)) {
        return messageType.getType(columnName);
    }
    // Parquet is case-sensitive, but Hive is not: all Hive column names are converted to lowercase.
    // The direct match above handles the common case; if it misses, fall back to a case-insensitive match.
    for (Type type : messageType.getFields()) {
        if (type.getName().equalsIgnoreCase(columnName)) {
            return type;
        }
    }
    return null;
}
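// Hedged usage sketch, not part of the source: the schema literal and column name below are
// invented purely to illustrate the case-insensitive fallback above. Assumes MessageTypeParser
// from the parquet.schema package is available.
MessageType exampleSchema = MessageTypeParser.parseMessageType(
        "message hive_table { optional binary UserName (UTF8); optional int64 visits; }");
// Hive lower-cases column names, so containsField("username") misses the "UserName" field;
// the case-insensitive loop then resolves it instead of returning null.
parquet.schema.Type resolved = getParquetTypeByName("username", exampleSchema);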
private List<ParquetInputSplit> generateSplitByDeprecatedConstructor(long min, long max) throws IOException {
    List<ParquetInputSplit> splits = new ArrayList<ParquetInputSplit>();
    List<ClientSideMetadataSplitStrategy.SplitInfo> splitInfos =
            ClientSideMetadataSplitStrategy.generateSplitInfo(blocks, hdfsBlocks, min, max);

    for (ClientSideMetadataSplitStrategy.SplitInfo splitInfo : splitInfos) {
        BlockMetaData lastRowGroup = splitInfo.getRowGroups().get(splitInfo.getRowGroupCount() - 1);
        long end = lastRowGroup.getStartingPos() + lastRowGroup.getTotalByteSize();

        ParquetInputSplit split = new ParquetInputSplit(
                fileStatus.getPath(),
                splitInfo.hdfsBlock.getOffset(),
                end,
                splitInfo.hdfsBlock.getHosts(),
                splitInfo.rowGroups,
                schema.toString(),
                null,
                null,
                extramd);
        splits.add(split);
    }

    return splits;
}
/**
 * Returns the result of merging {@code toMerge} into {@code mergedSchema}.
 *
 * @param toMerge the schema to merge into mergedSchema
 * @param mergedSchema the schema to append the fields to
 * @param strict whether primitive types must match exactly
 * @return the resulting schema
 */
static MessageType mergeInto(MessageType toMerge, MessageType mergedSchema, boolean strict) {
    if (mergedSchema == null) {
        return toMerge;
    }
    return mergedSchema.union(toMerge, strict);
}
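// Hedged usage sketch, not part of the source: both schema literals are invented for illustration.
// It exercises the union() call above; with strict = true, a field present in both schemas is
// expected to need the same primitive type, otherwise the merge fails.
MessageType accumulated = MessageTypeParser.parseMessageType(
        "message doc { required int64 id; }");
MessageType next = MessageTypeParser.parseMessageType(
        "message doc { required int64 id; optional binary name (UTF8); }");
// The result carries the union of the two field sets: id and name.
MessageType merged = mergeInto(next, accumulated, true);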
private void validateContains(
        MessageType schema,
        PageReadStore pages,
        String[] path,
        int values,
        BytesInput bytes) throws IOException {
    PageReader pageReader = pages.getPageReader(schema.getColumnDescription(path));
    Page page = pageReader.readPage();
    assertEquals(values, page.getValueCount());
    assertArrayEquals(bytes.toByteArray(), page.getBytes().toByteArray());
}
@Override
public Page getNextPage() {
    try {
        batchId++;
        long start = System.nanoTime();
        int batchSize = parquetReader.nextBatch();
        readTimeNanos += System.nanoTime() - start;

        if (closed || batchSize <= 0) {
            close();
            return null;
        }

        Block[] blocks = new Block[hiveColumnIndexes.length];
        for (int fieldId = 0; fieldId < blocks.length; fieldId++) {
            Type type = types.get(fieldId);
            if (constantBlocks[fieldId] != null) {
                // constant columns (such as partition keys) are sliced from a pre-built block
                blocks[fieldId] = constantBlocks[fieldId].getRegion(0, batchSize);
            }
            else {
                // regular columns are loaded lazily from the requested Parquet schema
                int fieldIndex = requestedSchema.getFieldIndex(columnNames.get(fieldId));
                ColumnDescriptor columnDescriptor = requestedSchema.getColumns().get(fieldIndex);
                blocks[fieldId] = new LazyBlock(batchSize, new ParquetBlockLoader(columnDescriptor, type));
            }
        }
        return new Page(batchSize, blocks);
    }
    catch (PrestoException e) {
        closeWithSuppression(e);
        throw e;
    }
    catch (IOException | RuntimeException | InterruptedException e) {
        if (e instanceof InterruptedException) {
            Thread.currentThread().interrupt();
        }
        closeWithSuppression(e);
        throw new PrestoException(HIVE_CURSOR_ERROR, e);
    }
}
@Override
@SuppressWarnings("deprecation")
public ReadContext init(
        Configuration configuration,
        Map<String, String> keyValueMetaData,
        MessageType messageType) {
    List<parquet.schema.Type> fields = columns.stream()
            .filter(column -> !column.isPartitionKey())
            .map(column -> getParquetType(column, messageType))
            .filter(Objects::nonNull)
            .collect(toList());
    MessageType requestedProjection = new MessageType(messageType.getName(), fields);
    return new ReadContext(requestedProjection);
}
/**
 * {@inheritDoc}
 */
@Override
public void initialize(InputSplit inputSplit, TaskAttemptContext taskAttemptContext)
        throws IOException, InterruptedException {
    Configuration configuration = taskAttemptContext.getConfiguration();
    ParquetInputSplit parquetInputSplit = (ParquetInputSplit) inputSplit;
    this.requestedSchema = MessageTypeParser.parseMessageType(parquetInputSplit.getRequestedSchema());
    this.columnCount = this.requestedSchema.getPaths().size();
    this.recordConverter = readSupport.prepareForRead(
            configuration,
            parquetInputSplit.getExtraMetadata(),
            MessageTypeParser.parseMessageType(parquetInputSplit.getSchema()),
            new ReadSupport.ReadContext(requestedSchema));

    Path path = parquetInputSplit.getPath();
    List<BlockMetaData> blocks = parquetInputSplit.getBlocks();
    List<ColumnDescriptor> columns = requestedSchema.getColumns();
    reader = new ParquetFileReader(configuration, path, blocks, columns);
    for (BlockMetaData block : blocks) {
        total += block.getRowCount();
    }
    LOG.info("RecordReader initialized will read a total of " + total + " records.");
}
private List<ParquetInputSplit> generateSplitByMinMaxSize(long min, long max) throws IOException {
    return ClientSideMetadataSplitStrategy.generateSplits(
            blocks, hdfsBlocks, fileStatus, schema.toString(), extramd, min, max);
}