/**
 * Converts a Thrift struct into a Parquet {@code MessageType}, applying the given projection
 * filter while converting.
 *
 * <p>The struct is visited with a new {@code ThriftSchemaConvertVisitor} under a root state that
 * is created with an empty {@code FieldsPath}, {@code REPEATED} and the name "ParquetSchema".
 *
 * @param struct the Thrift struct to convert
 * @param filter the projection filter handed to the visitor
 * @return a message type built from the root state's name and the fields of the kept group
 * @throws ThriftProjectionException if visiting the struct does not yield a "keep" result
 */
public static MessageType convert(StructType struct, FieldProjectionFilter filter) {
  State rootState = new State(new FieldsPath(), REPEATED, "ParquetSchema");
  ConvertedField root = struct.accept(new ThriftSchemaConvertVisitor(filter, true), rootState);

  if (root.isKeep()) {
    Type keptRoot = root.asKeep().getType();
    return new MessageType(rootState.name, keptRoot.asGroupType().getFields());
  }

  // Nothing survived the projection, so there is no schema to build.
  throw new ThriftProjectionException("No columns have been selected");
}
private ConvertedField visitListLike(ThriftField listLike, State state, boolean isSet) { State childState = new State(state.path, REPEATED, state.name + "_tuple"); ConvertedField converted = listLike.getType().accept(this, childState); if (converted.isKeep()) { // doProjection prevents an infinite recursion here if (isSet && doProjection) { ConvertedField fullConv = listLike .getType() .accept( new ThriftSchemaConvertVisitor(FieldProjectionFilter.ALL_COLUMNS, false), childState); if (!converted.asKeep().getType().equals(fullConv.asKeep().getType())) { throw new ThriftProjectionException( "Cannot select only a subset of the fields in a set, " + "for path " + state.path); } } return new Keep( state.path, listType(state.repetition, state.name, converted.asKeep().getType())); } return new Drop(state.path); }
@Override public ConvertedField visit(StructType structType, State state) { // special care is taken when converting unions, // because we are actually both converting + projecting in // one pass, and unions need special handling when projecting. final boolean isUnion = isUnion(structType.getStructOrUnionType()); boolean hasSentinelUnionColumns = false; boolean hasNonSentinelUnionColumns = false; List<Type> convertedChildren = new ArrayList<Type>(); for (ThriftField child : structType.getChildren()) { State childState = new State(state.path.push(child), getRepetition(child), child.getName()); ConvertedField converted = child.getType().accept(this, childState); if (isUnion && !converted.isKeep()) { // user is not keeping this "kind" of union, but we still need // to keep at least one of the primitives of this union around. // in order to know what "kind" of union each record is. // TODO: in the future, we should just filter these records out instead // re-do the recursion, with a new projection filter that keeps only // the first primitive it encounters ConvertedField firstPrimitive = child .getType() .accept( new ThriftSchemaConvertVisitor(new KeepOnlyFirstPrimitiveFilter(), true), childState); convertedChildren.add(firstPrimitive.asKeep().getType().withId(child.getFieldId())); hasSentinelUnionColumns = true; } if (converted.isSentinelUnion()) { // child field is a sentinel union that we should drop if possible if (childState.repetition == REQUIRED) { // but this field is required, so we may still need it convertedChildren.add(converted.asSentinelUnion().getType().withId(child.getFieldId())); hasSentinelUnionColumns = true; } } else if (converted.isKeep()) { // user has selected this column, so we keep it. 
convertedChildren.add(converted.asKeep().getType().withId(child.getFieldId())); hasNonSentinelUnionColumns = true; } } if (!hasNonSentinelUnionColumns && hasSentinelUnionColumns) { // this is a union, and user has not requested any of the children // of this union. We should drop this union, if possible, but // we may not be able to, so tag this as a sentinel. return new SentinelUnion( state.path, new GroupType(state.repetition, state.name, convertedChildren)); } if (hasNonSentinelUnionColumns) { // user requested some of the fields of this struct, so we keep the struct return new Keep(state.path, new GroupType(state.repetition, state.name, convertedChildren)); } else { // user requested none of the fields of this struct, so we drop it return new Drop(state.path); } }
@Override public ConvertedField visit(MapType mapType, State state) { ThriftField keyField = mapType.getKey(); ThriftField valueField = mapType.getValue(); State keyState = new State(state.path.push(keyField), REQUIRED, "key"); // TODO: This is a bug! this should be REQUIRED but changing this will // break the the schema compatibility check against old data // Thrift does not support null / missing map values. State valueState = new State(state.path.push(valueField), OPTIONAL, "value"); ConvertedField convertedKey = keyField.getType().accept(this, keyState); ConvertedField convertedValue = valueField.getType().accept(this, valueState); if (!convertedKey.isKeep()) { if (convertedValue.isKeep()) { throw new ThriftProjectionException( "Cannot select only the values of a map, you must keep the keys as well: " + state.path); } // neither key nor value was requested return new Drop(state.path); } // we are keeping the key, but we do not allow partial projections on keys // as that doesn't make sense when assembling back into a map. // NOTE: doProjections prevents us from infinite recursion here. if (doProjection) { ConvertedField fullConvKey = keyField .getType() .accept( new ThriftSchemaConvertVisitor(FieldProjectionFilter.ALL_COLUMNS, false), keyState); if (!fullConvKey.asKeep().getType().equals(convertedKey.asKeep().getType())) { throw new ThriftProjectionException( "Cannot select only a subset of the fields in a map key, " + "for path " + state.path); } } // now, are we keeping the value? 
if (convertedValue.isKeep()) { // keep both key and value Type mapField = mapType( state.repetition, state.name, convertedKey.asKeep().getType(), convertedValue.asKeep().getType()); return new Keep(state.path, mapField); } // keep only the key, not the value ConvertedField sentinelValue = valueField .getType() .accept( new ThriftSchemaConvertVisitor(new KeepOnlyFirstPrimitiveFilter(), true), valueState); Type mapField = mapType( state.repetition, state.name, convertedKey.asKeep().getType(), sentinelValue.asKeep().getType()); // signals to mapType method to project the value return new Keep(state.path, mapField); }