Example #1
 static GlobalMetaData mergeInto(
     FileMetaData toMerge, GlobalMetaData mergedMetadata, boolean strict) {
   MessageType schema = null;
   Map<String, Set<String>> newKeyValues = new HashMap<String, Set<String>>();
   Set<String> createdBy = new HashSet<String>();
   if (mergedMetadata != null) {
     schema = mergedMetadata.getSchema();
     newKeyValues.putAll(mergedMetadata.getKeyValueMetaData());
     createdBy.addAll(mergedMetadata.getCreatedBy());
   }
   if ((schema == null && toMerge.getSchema() != null)
       || (schema != null && !schema.equals(toMerge.getSchema()))) {
     schema = mergeInto(toMerge.getSchema(), schema, strict);
   }
   for (Entry<String, String> entry : toMerge.getKeyValueMetaData().entrySet()) {
     Set<String> values = newKeyValues.get(entry.getKey());
     if (values == null) {
       values = new HashSet<String>();
       newKeyValues.put(entry.getKey(), values);
     }
     values.add(entry.getValue());
   }
   createdBy.add(toMerge.getCreatedBy());
   return new GlobalMetaData(schema, newKeyValues, createdBy);
 }
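This accumulator is meant to be folded over the footer of every input file. A minimal sketch of that driver loop, assuming a hypothetical fileMetaDataList collected from each file's footer (strict=true demands exact primitive-type matches):

  GlobalMetaData merged = null;
  for (FileMetaData fileMetaData : fileMetaDataList) {
    merged = mergeInto(fileMetaData, merged, true);
  }
  // merged now holds the union schema, all distinct key/value entries,
  // and every created-by string seen across the files.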
Example #2
  public static parquet.schema.Type getParquetType(
      HiveColumnHandle column, MessageType messageType, boolean useParquetColumnNames) {
    if (useParquetColumnNames) {
      return getParquetTypeByName(column.getName(), messageType);
    }

    if (column.getHiveColumnIndex() < messageType.getFieldCount()) {
      return messageType.getType(column.getHiveColumnIndex());
    }
    return null;
  }
Example #3
    private parquet.schema.Type getParquetType(HiveColumnHandle column, MessageType messageType) {
      if (useParquetColumnNames) {
        if (messageType.containsField(column.getName())) {
          return messageType.getType(column.getName());
        }
        return null;
      }

      if (column.getHiveColumnIndex() < messageType.getFieldCount()) {
        return messageType.getType(column.getHiveColumnIndex());
      }
      return null;
    }
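Both variants fall back to positional lookup, which is only safe while Hive and the Parquet file declare columns in the same order; name lookup survives reordering, which is the point of useParquetColumnNames. A self-contained sketch of the two strategies against a hand-written schema (the schema string is made up):

  import parquet.schema.MessageType;
  import parquet.schema.MessageTypeParser;

  public class LookupDemo {
    public static void main(String[] args) {
      MessageType schema = MessageTypeParser.parseMessageType(
          "message hive_table { required int64 id; optional binary name; }");
      System.out.println(schema.getType(1));      // by position: optional binary name
      System.out.println(schema.getType("name")); // by name: same field, order-independent
    }
  }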
Example #4
  private static parquet.schema.Type getParquetTypeByName(
      String columnName, MessageType messageType) {
    if (messageType.containsField(columnName)) {
      return messageType.getType(columnName);
    }
    // Parquet is case-sensitive, but Hive is not: Hive lowercases all column names.
    // If the direct (case-sensitive) lookup above misses, fall back to a case-insensitive scan.
    for (Type type : messageType.getFields()) {
      if (type.getName().equalsIgnoreCase(columnName)) {
        return type;
      }
    }

    return null;
  }
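This fallback exists because Hive lowercases identifiers while Parquet preserves case, so a file written with mixed-case names would miss the direct lookup. A runnable sketch of the two-step match (the schema string is made up):

  import parquet.schema.MessageType;
  import parquet.schema.MessageTypeParser;
  import parquet.schema.Type;

  public class CaseInsensitiveLookup {
    public static void main(String[] args) {
      MessageType schema = MessageTypeParser.parseMessageType(
          "message doc { required int64 UserId; }");
      System.out.println(schema.containsField("userid")); // false: direct match is case-sensitive
      for (Type field : schema.getFields()) {
        if (field.getName().equalsIgnoreCase("userid")) {
          System.out.println("matched " + field.getName()); // matched UserId
        }
      }
    }
  }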
Example #5
  private List<ParquetInputSplit> generateSplitByDeprecatedConstructor(long min, long max)
      throws IOException {
    List<ParquetInputSplit> splits = new ArrayList<ParquetInputSplit>();
    List<ClientSideMetadataSplitStrategy.SplitInfo> splitInfos =
        ClientSideMetadataSplitStrategy.generateSplitInfo(blocks, hdfsBlocks, min, max);

    for (ClientSideMetadataSplitStrategy.SplitInfo splitInfo : splitInfos) {
      BlockMetaData lastRowGroup = splitInfo.getRowGroups().get(splitInfo.getRowGroupCount() - 1);
      long end = lastRowGroup.getStartingPos() + lastRowGroup.getTotalByteSize();

      ParquetInputSplit split =
          new ParquetInputSplit(
              fileStatus.getPath(),
              splitInfo.hdfsBlock.getOffset(),
              end,
              splitInfo.hdfsBlock.getHosts(),
              splitInfo.rowGroups,
              schema.toString(),
              null,
              null,
              extramd);
      splits.add(split);
    }

    return splits;
  }
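The end offset is pure arithmetic: the split must cover its last row group in full, wherever the underlying HDFS block happens to end. With illustrative numbers:

  long lastGroupStart = 4L;          // lastRowGroup.getStartingPos()
  long lastGroupBytes = 1_048_576L;  // lastRowGroup.getTotalByteSize()
  long end = lastGroupStart + lastGroupBytes; // 1_048_580: passed as the split's end position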
Example #6
  /**
   * Returns the result of merging {@code toMerge} into {@code mergedSchema}.
   *
   * @param toMerge the schema to merge into {@code mergedSchema}
   * @param mergedSchema the schema to append the fields to; may be null
   * @param strict whether primitive types must match exactly
   * @return the resulting schema
   */
  static MessageType mergeInto(MessageType toMerge, MessageType mergedSchema, boolean strict) {
    if (mergedSchema == null) {
      return toMerge;
    }

    return mergedSchema.union(toMerge, strict);
  }
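A standalone sketch of the strict merge, assuming two hand-written schemas; union comes from parquet-mr's GroupType and is exactly what the method above delegates to:

  import parquet.schema.MessageType;
  import parquet.schema.MessageTypeParser;

  MessageType a = MessageTypeParser.parseMessageType(
      "message m { required int32 id; }");
  MessageType b = MessageTypeParser.parseMessageType(
      "message m { required int32 id; optional binary name; }");
  MessageType merged = a.union(b, true); // strict: id must stay required int32 in both
  // merged contains both id and name.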
Example #7
 private void validateContains(
     MessageType schema, PageReadStore pages, String[] path, int values, BytesInput bytes)
     throws IOException {
   PageReader pageReader = pages.getPageReader(schema.getColumnDescription(path));
   Page page = pageReader.readPage();
   assertEquals(values, page.getValueCount());
   assertArrayEquals(bytes.toByteArray(), page.getBytes().toByteArray());
 }
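A test helper like this is typically driven once per column after pulling a row group out of an open ParquetFileReader. A hedged sketch of the calling side (the reader, schema, counts, and byte arrays are all assumptions):

  PageReadStore pages = reader.readNextRowGroup(); // null once every row group is consumed
  validateContains(schema, pages, new String[] {"id"}, 100, BytesInput.from(idBytes));
  validateContains(schema, pages, new String[] {"name"}, 100, BytesInput.from(nameBytes));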
Example #8
  @Override
  public Page getNextPage() {
    try {
      batchId++;
      long start = System.nanoTime();

      int batchSize = parquetReader.nextBatch();

      readTimeNanos += System.nanoTime() - start;

      if (closed || batchSize <= 0) {
        close();
        return null;
      }

      Block[] blocks = new Block[hiveColumnIndexes.length];
      for (int fieldId = 0; fieldId < blocks.length; fieldId++) {
        Type type = types.get(fieldId);
        if (constantBlocks[fieldId] != null) {
          blocks[fieldId] = constantBlocks[fieldId].getRegion(0, batchSize);
        } else {
          int fieldIndex = requestedSchema.getFieldIndex(columnNames.get(fieldId));
          ColumnDescriptor columnDescriptor = requestedSchema.getColumns().get(fieldIndex);
          blocks[fieldId] =
              new LazyBlock(batchSize, new ParquetBlockLoader(columnDescriptor, type));
        }
      }
      return new Page(batchSize, blocks);
    } catch (PrestoException e) {
      closeWithSuppression(e);
      throw e;
    } catch (IOException | RuntimeException | InterruptedException e) {
      if (e instanceof InterruptedException) {
        Thread.currentThread().interrupt();
      }
      closeWithSuppression(e);
      throw new PrestoException(HIVE_CURSOR_ERROR, e);
    }
  }
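One detail worth copying: InterruptedException must not be swallowed, so the handler restores the thread's interrupt flag before wrapping. The pattern in isolation, with hypothetical helper names:

  try {
    readNextBatch(); // hypothetical blocking read
  } catch (IOException | RuntimeException | InterruptedException e) {
    if (e instanceof InterruptedException) {
      Thread.currentThread().interrupt(); // restore the flag the throw cleared
    }
    closeQuietly();                       // hypothetical cleanup
    throw new RuntimeException("batch read failed", e);
  }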
Example #9
 @Override
 @SuppressWarnings("deprecation")
 public ReadContext init(
     Configuration configuration,
     Map<String, String> keyValueMetaData,
     MessageType messageType) {
   List<parquet.schema.Type> fields =
       columns
           .stream()
           .filter(column -> !column.isPartitionKey())
           .map(column -> getParquetType(column, messageType))
           .filter(Objects::nonNull)
           .collect(toList());
   MessageType requestedProjection = new MessageType(messageType.getName(), fields);
   return new ReadContext(requestedProjection);
 }
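The result is a pruned requested projection: only non-partition columns that resolve against the file schema survive, and the reader skips everything else. The same pruning as a standalone sketch with a plain name set (toList() is java.util.stream.Collectors.toList(); the wanted set is hypothetical):

  import java.util.*;
  import parquet.schema.MessageType;
  import parquet.schema.MessageTypeParser;
  import parquet.schema.Type;
  import static java.util.stream.Collectors.toList;

  MessageType fileSchema = MessageTypeParser.parseMessageType(
      "message t { required int64 id; optional binary name; optional int32 age; }");
  Set<String> wanted = new HashSet<String>(Arrays.asList("id", "age"));
  List<Type> kept = fileSchema.getFields().stream()
      .filter(f -> wanted.contains(f.getName()))
      .collect(toList());
  MessageType projection = new MessageType(fileSchema.getName(), kept);
  // projection: message t { required int64 id; optional int32 age; }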
Example #10
  /** {@inheritDoc} */
  @Override
  public void initialize(InputSplit inputSplit, TaskAttemptContext taskAttemptContext)
      throws IOException, InterruptedException {
    Configuration configuration = taskAttemptContext.getConfiguration();
    ParquetInputSplit parquetInputSplit = (ParquetInputSplit) inputSplit;
    this.requestedSchema =
        MessageTypeParser.parseMessageType(parquetInputSplit.getRequestedSchema());
    this.columnCount = this.requestedSchema.getPaths().size();
    this.recordConverter =
        readSupport.prepareForRead(
            configuration,
            parquetInputSplit.getExtraMetadata(),
            MessageTypeParser.parseMessageType(parquetInputSplit.getSchema()),
            new ReadSupport.ReadContext(requestedSchema));

    Path path = parquetInputSplit.getPath();
    List<BlockMetaData> blocks = parquetInputSplit.getBlocks();
    List<ColumnDescriptor> columns = requestedSchema.getColumns();
    reader = new ParquetFileReader(configuration, path, blocks, columns);
    for (BlockMetaData block : blocks) {
      total += block.getRowCount();
    }
    LOG.info("RecordReader initialized will read a total of " + total + " records.");
  }
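Note that both the file schema and the requested schema travel through the split as strings, which is why the reader round-trips them through MessageTypeParser. The round-trip in isolation:

  import parquet.schema.MessageType;
  import parquet.schema.MessageTypeParser;

  MessageType schema = MessageTypeParser.parseMessageType(
      "message m { required int32 id; }");
  String wire = schema.toString();                       // what the split carries
  MessageType parsed = MessageTypeParser.parseMessageType(wire);
  System.out.println(schema.equals(parsed));             // true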
Example #11
 private List<ParquetInputSplit> generateSplitByMinMaxSize(long min, long max) throws IOException {
   return ClientSideMetadataSplitStrategy.generateSplits(
       blocks, hdfsBlocks, fileStatus, schema.toString(), extramd, min, max);
 }
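The min/max bounds are normally sourced from the job configuration before this call. A hedged sketch using the standard Hadoop split-size keys (whether this code path reads exactly these keys is an assumption):

  Configuration conf = new Configuration();
  long min = conf.getLong("mapred.min.split.size", 0L);
  long max = conf.getLong("mapred.max.split.size", Long.MAX_VALUE);
  List<ParquetInputSplit> splits = generateSplitByMinMaxSize(min, max);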