/**
 * Finds the highest index that any type in the footer lists as a subtype.
 *
 * @return the largest subtype index found across all footer types
 * @throws java.util.NoSuchElementException if no footer type lists any subtype
 */
private int getLastIdx() {
  boolean foundAny = false;
  int highest = Integer.MIN_VALUE;
  for (Type footerType : footer.getTypesList()) {
    for (int subtypeIdx : footerType.getSubtypesList()) {
      if (!foundAny || subtypeIdx > highest) {
        highest = subtypeIdx;
        foundAny = true;
      }
    }
  }
  if (!foundAny) {
    // taking the maximum of nothing is an error, not a sentinel value
    throw new java.util.NoSuchElementException();
  }
  return highest;
}
private long getRawDataSizeOfColumn(int colIdx) { OrcProto.ColumnStatistics colStat = footer.getStatistics(colIdx); long numVals = colStat.getNumberOfValues(); Type type = footer.getTypes(colIdx); switch (type.getKind()) { case BINARY: // old orc format doesn't support binary statistics. checking for binary // statistics is not required as protocol buffers takes care of it. return colStat.getBinaryStatistics().getSum(); case STRING: case CHAR: case VARCHAR: // old orc format doesn't support sum for string statistics. checking for // existence is not required as protocol buffers takes care of it. // ORC strings are deserialized to java strings. so use java data model's // string size numVals = numVals == 0 ? 1 : numVals; int avgStrLen = (int) (colStat.getStringStatistics().getSum() / numVals); return numVals * JavaDataModel.get().lengthForStringOfLength(avgStrLen); case TIMESTAMP: return numVals * JavaDataModel.get().lengthOfTimestamp(); case DATE: return numVals * JavaDataModel.get().lengthOfDate(); case DECIMAL: return numVals * JavaDataModel.get().lengthOfDecimal(); case DOUBLE: case LONG: return numVals * JavaDataModel.get().primitive2(); case FLOAT: case INT: case SHORT: case BOOLEAN: case BYTE: return numVals * JavaDataModel.get().primitive1(); default: LOG.debug("Unknown primitive category."); break; } return 0; }
private List<Integer> getColumnIndicesFromNames(List<String> colNames) { // top level struct Type type = footer.getTypesList().get(0); List<Integer> colIndices = Lists.newArrayList(); List<String> fieldNames = type.getFieldNamesList(); int fieldIdx = 0; for (String colName : colNames) { if (fieldNames.contains(colName)) { fieldIdx = fieldNames.indexOf(colName); } // a single field may span multiple columns. find start and end column // index for the requested field int idxStart = type.getSubtypes(fieldIdx); int idxEnd; // if the specified is the last field and then end index will be last // column index if (fieldIdx + 1 > fieldNames.size() - 1) { idxEnd = getLastIdx() + 1; } else { idxEnd = type.getSubtypes(fieldIdx + 1); } // if start index and end index are same then the field is a primitive // field else complex field (like map, list, struct, union) if (idxStart == idxEnd) { // simple field colIndices.add(idxStart); } else { // complex fields spans multiple columns for (int i = idxStart; i < idxEnd; i++) { colIndices.add(i); } } } return colIndices; }