private void handleSampling(DriverContext context, MapWork mWork, JobConf job, HiveConf conf)
    throws Exception {
  assert mWork.getAliasToWork().keySet().size() == 1;

  String alias = mWork.getAliases().get(0);
  Operator<?> topOp = mWork.getAliasToWork().get(alias);
  PartitionDesc partDesc = mWork.getAliasToPartnInfo().get(alias);

  ArrayList<String> paths = mWork.getPaths();
  ArrayList<PartitionDesc> parts = mWork.getPartitionDescs();

  List<Path> inputPaths = new ArrayList<Path>(paths.size());
  for (String path : paths) {
    inputPaths.add(new Path(path));
  }

  Path tmpPath = context.getCtx().getExternalTmpPath(inputPaths.get(0));
  Path partitionFile = new Path(tmpPath, ".partitions");
  ShimLoader.getHadoopShims().setTotalOrderPartitionFile(job, partitionFile);
  PartitionKeySampler sampler = new PartitionKeySampler();

  if (mWork.getSamplingType() == MapWork.SAMPLING_ON_PREV_MR) {
    console.printInfo("Use sampling data created in previous MR");
    // merge the sampling data created by the previous MR run and make partition keys
    // for the total sort
    for (Path path : inputPaths) {
      FileSystem fs = path.getFileSystem(job);
      for (FileStatus status : fs.globStatus(new Path(path, ".sampling*"))) {
        sampler.addSampleFile(status.getPath(), job);
      }
    }
  } else if (mWork.getSamplingType() == MapWork.SAMPLING_ON_START) {
    console.printInfo("Creating sampling data..");
    assert topOp instanceof TableScanOperator;
    TableScanOperator ts = (TableScanOperator) topOp;

    FetchWork fetchWork;
    if (!partDesc.isPartitioned()) {
      assert paths.size() == 1;
      fetchWork = new FetchWork(inputPaths.get(0), partDesc.getTableDesc());
    } else {
      fetchWork = new FetchWork(inputPaths, parts, partDesc.getTableDesc());
    }
    fetchWork.setSource(ts);

    // random sampling
    FetchOperator fetcher = PartitionKeySampler.createSampler(fetchWork, conf, job, ts);
    try {
      ts.initialize(conf, new ObjectInspector[] {fetcher.getOutputObjectInspector()});
      OperatorUtils.setChildrenCollector(ts.getChildOperators(), sampler);
      while (fetcher.pushRow()) {}
    } finally {
      fetcher.clearFetchContext();
    }
  } else {
    throw new IllegalArgumentException("Invalid sampling type " + mWork.getSamplingType());
  }
  sampler.writePartitionKeys(partitionFile, conf, job);
}
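
The ".partitions" file written above is the standard total-order partition list that Hadoop's TotalOrderPartitioner consumes; the shim call wires it into the job. A minimal standalone sketch of that consumer side, assuming the mapreduce-API partitioner and a hypothetical scratch path:

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.partition.TotalOrderPartitioner;

public class TotalOrderSetupSketch {
  public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    Job job = Job.getInstance(conf, "total-order-sort");
    // With the sampled keys as split points, reducer i receives only keys
    // in the i-th sorted range, so the concatenated reducer outputs form
    // one globally sorted result.
    job.setPartitionerClass(TotalOrderPartitioner.class);
    // Hypothetical path; handleSampling writes the analogous ".partitions"
    // file under the query's external scratch directory.
    TotalOrderPartitioner.setPartitionFile(job.getConfiguration(),
        new Path("/tmp/scratch/.partitions"));
  }
}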
/**
 * Traverse all the partitions for a table, and get the OI for the table. Note that a conversion
 * is required if any partition OI differs from the table OI. For example, if the query
 * references table T (partitions P1, P2), and P1's schema is the same as T's whereas P2's
 * schema differs from T's, conversion might be needed for both P1 and P2, since a SettableOI
 * might be needed for T.
 */
private Map<TableDesc, StructObjectInspector> getConvertedOI(Configuration hconf)
    throws HiveException {
  Map<TableDesc, StructObjectInspector> tableDescOI =
      new HashMap<TableDesc, StructObjectInspector>();
  Set<TableDesc> identityConverterTableDesc = new HashSet<TableDesc>();
  try {
    Map<ObjectInspector, Boolean> oiSettableProperties = new HashMap<ObjectInspector, Boolean>();

    for (String onefile : conf.getPathToAliases().keySet()) {
      PartitionDesc pd = conf.getPathToPartitionInfo().get(onefile);
      TableDesc tableDesc = pd.getTableDesc();
      Deserializer partDeserializer = pd.getDeserializer(hconf);

      StructObjectInspector partRawRowObjectInspector;
      if (Utilities.isInputFileFormatSelfDescribing(pd)) {
        Deserializer tblDeserializer = tableDesc.getDeserializer(hconf);
        partRawRowObjectInspector = (StructObjectInspector) tblDeserializer.getObjectInspector();
      } else {
        partRawRowObjectInspector = (StructObjectInspector) partDeserializer.getObjectInspector();
      }

      StructObjectInspector tblRawRowObjectInspector = tableDescOI.get(tableDesc);
      if ((tblRawRowObjectInspector == null)
          || (identityConverterTableDesc.contains(tableDesc))) {
        Deserializer tblDeserializer = tableDesc.getDeserializer(hconf);
        tblRawRowObjectInspector =
            (StructObjectInspector) ObjectInspectorConverters.getConvertedOI(
                partRawRowObjectInspector, tblDeserializer.getObjectInspector(),
                oiSettableProperties);

        if (identityConverterTableDesc.contains(tableDesc)) {
          if (!partRawRowObjectInspector.equals(tblRawRowObjectInspector)) {
            identityConverterTableDesc.remove(tableDesc);
          }
        } else if (partRawRowObjectInspector.equals(tblRawRowObjectInspector)) {
          identityConverterTableDesc.add(tableDesc);
        }

        tableDescOI.put(tableDesc, tblRawRowObjectInspector);
      }
    }
  } catch (Exception e) {
    throw new HiveException(e);
  }
  return tableDescOI;
}
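
To make the conversion concrete, here is a minimal, hypothetical sketch of what getConvertedOI delegates to: a partition whose column type drifted from the table's (int vs. bigint) yields a table-shaped OI (settable where needed) rather than the partition's own OI. Class and column names are illustrative only:

import java.util.Arrays;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorConverters;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory;

public class ConvertedOISketch {
  public static void main(String[] args) {
    // Partition P2 wrote the column as int...
    ObjectInspector partOI = ObjectInspectorFactory.getStandardStructObjectInspector(
        Arrays.asList("key"),
        Arrays.asList((ObjectInspector) PrimitiveObjectInspectorFactory.javaIntObjectInspector));
    // ...while the table now declares it as bigint.
    ObjectInspector tblOI = ObjectInspectorFactory.getStandardStructObjectInspector(
        Arrays.asList("key"),
        Arrays.asList((ObjectInspector) PrimitiveObjectInspectorFactory.javaLongObjectInspector));
    // getConvertedOI returns an OI shaped like the table's, so a converter
    // can write the partition's int values into bigint fields.
    ObjectInspector converted = ObjectInspectorConverters.getConvertedOI(partOI, tblOI);
    System.out.println(converted.getTypeName()); // expected: struct<key:bigint>
  }
}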
/*
 * This is the same as the setChildren method below but for empty tables.
 * It takes care of the following:
 * 1. Create the right object inspector.
 * 2. Set up the childrenOpToOI with the object inspector.
 * This ensures that initialization happens correctly.
 */
public void initEmptyInputChildren(List<Operator<?>> children, Configuration hconf)
    throws SerDeException, Exception {
  setChildOperators(children);
  for (Operator<?> child : children) {
    TableScanOperator tsOp = (TableScanOperator) child;
    StructObjectInspector soi = null;
    PartitionDesc partDesc = conf.getAliasToPartnInfo().get(tsOp.getConf().getAlias());
    Deserializer serde = partDesc.getTableDesc().getDeserializer();
    partDesc.setProperties(partDesc.getProperties());
    MapOpCtx opCtx = new MapOpCtx(tsOp.getConf().getAlias(), child, partDesc);
    StructObjectInspector tableRowOI = (StructObjectInspector) serde.getObjectInspector();
    initObjectInspector(hconf, opCtx, tableRowOI);
    soi = opCtx.rowObjectInspector;
    child.getParentOperators().add(this);
    childrenOpToOI.put(child, soi);
  }
}
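
For empty inputs there are no partition rows to convert, so the table deserializer's OI is used verbatim. A small sketch of obtaining such an OI from a LazySimpleSerDe, assuming the classic two-argument initialize and hypothetical column names:

import java.util.Properties;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hive.serde.serdeConstants;
import org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe;
import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector;

public class EmptyInputOISketch {
  public static void main(String[] args) throws Exception {
    // Table schema as the metastore would hand it to the serde.
    Properties props = new Properties();
    props.setProperty(serdeConstants.LIST_COLUMNS, "key,value");
    props.setProperty(serdeConstants.LIST_COLUMN_TYPES, "int,string");
    LazySimpleSerDe serde = new LazySimpleSerDe();
    serde.initialize(new Configuration(), props);
    // This is the tableRowOI that initEmptyInputChildren passes straight
    // through to initObjectInspector.
    StructObjectInspector soi = (StructObjectInspector) serde.getObjectInspector();
    System.out.println(soi.getTypeName()); // struct<key:int,value:string>
  }
}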
public void setChildren(Configuration hconf) throws Exception {
  List<Operator<? extends OperatorDesc>> children =
      new ArrayList<Operator<? extends OperatorDesc>>();

  Map<TableDesc, StructObjectInspector> convertedOI = getConvertedOI(hconf);

  for (Map.Entry<String, ArrayList<String>> entry : conf.getPathToAliases().entrySet()) {
    String onefile = entry.getKey();
    List<String> aliases = entry.getValue();
    PartitionDesc partDesc = conf.getPathToPartitionInfo().get(onefile);

    for (String alias : aliases) {
      Operator<? extends OperatorDesc> op = conf.getAliasToWork().get(alias);
      if (isLogDebugEnabled) {
        LOG.debug("Adding alias " + alias + " to work list for file " + onefile);
      }
      Map<Operator<?>, MapOpCtx> contexts = opCtxMap.get(onefile);
      if (contexts == null) {
        opCtxMap.put(onefile, contexts = new LinkedHashMap<Operator<?>, MapOpCtx>());
      }
      if (contexts.containsKey(op)) {
        continue;
      }
      MapOpCtx context = new MapOpCtx(alias, op, partDesc);
      StructObjectInspector tableRowOI = convertedOI.get(partDesc.getTableDesc());
      contexts.put(op, initObjectInspector(hconf, context, tableRowOI));

      if (!children.contains(op)) {
        op.setParentOperators(new ArrayList<Operator<? extends OperatorDesc>>(1));
        op.getParentOperators().add(this);
        children.add(op);
      }
    }
  }

  initOperatorContext(children);

  // we found all the operators that we are supposed to process
  setChildOperators(children);
}
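
For reference, a hypothetical sketch of the plan shape this method walks: pathToAliases can map one input path to several aliases (as in a self-join), while aliasToWork (omitted here) maps each alias to the root operator of its pipeline. setChildren then creates one MapOpCtx per (path, operator) pair but parent-links each operator only once:

import java.util.ArrayList;
import java.util.Arrays;
import java.util.LinkedHashMap;

public class PlanShapeSketch {
  public static void main(String[] args) {
    // Hypothetical paths and aliases; "select ... from t1 a join t1 b ..."
    // would yield two aliases over the same input directory.
    LinkedHashMap<String, ArrayList<String>> pathToAliases = new LinkedHashMap<>();
    pathToAliases.put("hdfs:///warehouse/t1", new ArrayList<>(Arrays.asList("a", "b")));
    pathToAliases.put("hdfs:///warehouse/t2", new ArrayList<>(Arrays.asList("c")));
    // The outer loop of setChildren iterates these entries; the inner loop
    // resolves each alias to its operator and builds a MapOpCtx for it.
    System.out.println(pathToAliases);
  }
}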
private MapOpCtx initObjectInspector(Configuration hconf, MapOpCtx opCtx,
    StructObjectInspector tableRowOI) throws Exception {
  PartitionDesc pd = opCtx.partDesc;
  TableDesc td = pd.getTableDesc();

  // Use table properties in the case of unpartitioned tables,
  // and the union of table properties and partition properties, with partition
  // properties taking precedence, in the case of partitioned tables
  Properties overlayedProps =
      SerDeUtils.createOverlayedProperties(td.getProperties(), pd.getProperties());

  Map<String, String> partSpec = pd.getPartSpec();

  opCtx.tableName = String.valueOf(overlayedProps.getProperty("name"));
  opCtx.partName = String.valueOf(partSpec);
  opCtx.deserializer = pd.getDeserializer(hconf);

  StructObjectInspector partRawRowObjectInspector;
  if (Utilities.isInputFileFormatSelfDescribing(pd)) {
    partRawRowObjectInspector = tableRowOI;
  } else {
    partRawRowObjectInspector =
        (StructObjectInspector) opCtx.deserializer.getObjectInspector();
  }

  opCtx.partTblObjectInspectorConverter =
      ObjectInspectorConverters.getConverter(partRawRowObjectInspector, tableRowOI);

  // Next, check if this table has partitions; if so, get the list of partition names
  // and allocate the object inspectors for the partition columns
  String pcols = overlayedProps.getProperty(hive_metastoreConstants.META_TABLE_PARTITION_COLUMNS);

  if (pcols != null && pcols.length() > 0) {
    String[] partKeys = pcols.trim().split("/");
    String pcolTypes =
        overlayedProps.getProperty(hive_metastoreConstants.META_TABLE_PARTITION_COLUMN_TYPES);
    String[] partKeyTypes = pcolTypes.trim().split(":");

    if (partKeys.length > partKeyTypes.length) {
      throw new HiveException("Internal error : partKeys length, " + partKeys.length +
          " greater than partKeyTypes length, " + partKeyTypes.length);
    }

    List<String> partNames = new ArrayList<String>(partKeys.length);
    Object[] partValues = new Object[partKeys.length];
    List<ObjectInspector> partObjectInspectors =
        new ArrayList<ObjectInspector>(partKeys.length);

    for (int i = 0; i < partKeys.length; i++) {
      String key = partKeys[i];
      partNames.add(key);
      ObjectInspector oi = PrimitiveObjectInspectorFactory.getPrimitiveWritableObjectInspector(
          TypeInfoFactory.getPrimitiveTypeInfo(partKeyTypes[i]));

      // Partitions do not exist for this table
      if (partSpec == null) {
        // for partitionless tables, initialize partValue to null
        partValues[i] = null;
      } else {
        partValues[i] = ObjectInspectorConverters.getConverter(
            PrimitiveObjectInspectorFactory.javaStringObjectInspector, oi)
            .convert(partSpec.get(key));
      }
      partObjectInspectors.add(oi);
    }
    opCtx.rowWithPart = new Object[] {null, partValues};
    opCtx.partObjectInspector =
        ObjectInspectorFactory.getStandardStructObjectInspector(partNames, partObjectInspectors);
  }

  // The op may not be a TableScan for mapjoins.
  // Consider the query: select /*+MAPJOIN(a)*/ count(*) FROM T1 a JOIN T2 b ON a.key = b.key;
  // In that case, it will be a Select, but the rowOI need not be amended
  if (opCtx.op instanceof TableScanOperator) {
    TableScanOperator tsOp = (TableScanOperator) opCtx.op;
    TableScanDesc tsDesc = tsOp.getConf();
    if (tsDesc != null && tsDesc.hasVirtualCols()) {
      opCtx.vcs = tsDesc.getVirtualCols();
      opCtx.vcValues = new Object[opCtx.vcs.size()];
      opCtx.vcsObjectInspector = VirtualColumn.getVCSObjectInspector(opCtx.vcs);
      if (opCtx.isPartitioned()) {
        opCtx.rowWithPartAndVC = Arrays.copyOfRange(opCtx.rowWithPart, 0, 3);
      } else {
        opCtx.rowWithPartAndVC = new Object[2];
      }
    }
  }

  if (!opCtx.hasVC() && !opCtx.isPartitioned()) {
    opCtx.rowObjectInspector = tableRowOI;
    return opCtx;
  }

  List<StructObjectInspector> inspectors = new ArrayList<StructObjectInspector>();
  inspectors.add(tableRowOI);
  if (opCtx.isPartitioned()) {
    inspectors.add(opCtx.partObjectInspector);
  }
  if (opCtx.hasVC()) {
    inspectors.add(opCtx.vcsObjectInspector);
  }
  opCtx.rowObjectInspector = ObjectInspectorFactory.getUnionStructObjectInspector(inspectors);
  return opCtx;
}
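
The union-struct construction at the end can be illustrated in isolation. A minimal sketch, assuming Hive's serde2 factories and hypothetical column names, combining a table-row OI with a partition-column OI into the single flattened row OI the operator exposes:

import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory;
import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory;

public class UnionStructOISketch {
  public static void main(String[] args) {
    // The table's own columns...
    StructObjectInspector dataOI = ObjectInspectorFactory.getStandardStructObjectInspector(
        Arrays.asList("key", "value"),
        Arrays.asList(
            (ObjectInspector) PrimitiveObjectInspectorFactory.javaIntObjectInspector,
            PrimitiveObjectInspectorFactory.javaStringObjectInspector));
    // ...plus the partition column, mirroring the [table row, partition
    // values, virtual columns] layout that MapOpCtx assembles.
    StructObjectInspector partOI = ObjectInspectorFactory.getStandardStructObjectInspector(
        Arrays.asList("ds"),
        Arrays.asList(
            (ObjectInspector) PrimitiveObjectInspectorFactory.writableStringObjectInspector));
    List<StructObjectInspector> ois = new ArrayList<>(Arrays.asList(dataOI, partOI));
    // Downstream operators see one flat struct spanning all three sources.
    StructObjectInspector rowOI = ObjectInspectorFactory.getUnionStructObjectInspector(ois);
    System.out.println(rowOI.getTypeName()); // struct<key:int,value:string,ds:string>
  }
}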