/**
 * Method to fetch table data.
 *
 * @param table table name
 * @param database database
 * @return list of rows with columns in comma-separated form
 * @throws Exception if any error occurs
 */
private List<String> getTableData(String table, String database) throws Exception {
  HiveConf conf = new HiveConf();
  conf.addResource("hive-site.xml");
  ArrayList<String> results = new ArrayList<String>();
  ArrayList<String> temp = new ArrayList<String>();

  Hive hive = Hive.get(conf);
  org.apache.hadoop.hive.ql.metadata.Table tbl = hive.getTable(database, table);

  FetchWork work;
  if (!tbl.getPartCols().isEmpty()) {
    List<Partition> partitions = hive.getPartitions(tbl);
    List<PartitionDesc> partDesc = new ArrayList<PartitionDesc>();
    List<String> partLocs = new ArrayList<String>();
    for (Partition part : partitions) {
      partLocs.add(part.getLocation());
      partDesc.add(Utilities.getPartitionDesc(part));
    }
    work = new FetchWork(partLocs, partDesc, Utilities.getTableDesc(tbl));
    work.setLimit(100);
  } else {
    work = new FetchWork(tbl.getDataLocation().toString(), Utilities.getTableDesc(tbl));
  }

  FetchTask task = new FetchTask();
  task.setWork(work);
  task.initialize(conf, null, null);
  task.fetch(temp);

  for (String str : temp) {
    results.add(str.replace("\t", ","));
  }
  return results;
}
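// A minimal usage sketch (the caller below is an assumption, not part of the
// original code): the helper returns each fetched row as a comma-separated
// string, so a caller only needs to print or collect the preview rows.
private void printTablePreview(String database, String table) throws Exception {
  for (String row : getTableData(table, database)) {
    System.out.println(row); // e.g. "1,foo,2014-01-01"
  }
}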
private void handleSampling(DriverContext context, MapWork mWork, JobConf job, HiveConf conf)
    throws Exception {
  assert mWork.getAliasToWork().keySet().size() == 1;

  String alias = mWork.getAliases().get(0);
  Operator<?> topOp = mWork.getAliasToWork().get(alias);
  PartitionDesc partDesc = mWork.getAliasToPartnInfo().get(alias);

  ArrayList<String> paths = mWork.getPaths();
  ArrayList<PartitionDesc> parts = mWork.getPartitionDescs();

  List<Path> inputPaths = new ArrayList<Path>(paths.size());
  for (String path : paths) {
    inputPaths.add(new Path(path));
  }

  Path tmpPath = context.getCtx().getExternalTmpPath(inputPaths.get(0));
  Path partitionFile = new Path(tmpPath, ".partitions");
  ShimLoader.getHadoopShims().setTotalOrderPartitionFile(job, partitionFile);
  PartitionKeySampler sampler = new PartitionKeySampler();

  if (mWork.getSamplingType() == MapWork.SAMPLING_ON_PREV_MR) {
    console.printInfo("Use sampling data created in previous MR");
    // merge sampling data from the previous MR and make partition keys for the total sort
    for (Path path : inputPaths) {
      FileSystem fs = path.getFileSystem(job);
      for (FileStatus status : fs.globStatus(new Path(path, ".sampling*"))) {
        sampler.addSampleFile(status.getPath(), job);
      }
    }
  } else if (mWork.getSamplingType() == MapWork.SAMPLING_ON_START) {
    console.printInfo("Creating sampling data..");
    assert topOp instanceof TableScanOperator;
    TableScanOperator ts = (TableScanOperator) topOp;

    FetchWork fetchWork;
    if (!partDesc.isPartitioned()) {
      assert paths.size() == 1;
      fetchWork = new FetchWork(inputPaths.get(0), partDesc.getTableDesc());
    } else {
      fetchWork = new FetchWork(inputPaths, parts, partDesc.getTableDesc());
    }
    fetchWork.setSource(ts);

    // random sampling
    FetchOperator fetcher = PartitionKeySampler.createSampler(fetchWork, conf, job, ts);
    try {
      ts.initialize(conf, new ObjectInspector[] {fetcher.getOutputObjectInspector()});
      OperatorUtils.setChildrenCollector(ts.getChildOperators(), sampler);
      while (fetcher.pushRow()) { }
    } finally {
      fetcher.clearFetchContext();
    }
  } else {
    throw new IllegalArgumentException("Invalid sampling type " + mWork.getSamplingType());
  }
  sampler.writePartitionKeys(partitionFile, conf, job);
}
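// A plausible call site for the sampling step above (a sketch only; the
// HiveTotalOrderPartitioner class name and the fallback policy are assumptions,
// not taken from the original): sampling only matters when a total-order sort
// runs with more than one reducer, and a sampling failure can fall back to a
// single reducer instead of failing the query.
if (mWork.getSamplingType() > 0 && job.getNumReduceTasks() > 1) {
  try {
    handleSampling(context, mWork, job, conf);
    job.setPartitionerClass(HiveTotalOrderPartitioner.class); // assumed partitioner
  } catch (Exception e) {
    console.printInfo("Sampling failed, falling back to a single reducer");
    job.setNumReduceTasks(1);
  }
}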
/** Get the result schema and misc metadata, in the context of SELECT. */
public synchronized ResultsMetadata getResultMetadata() {
  Schema schema = null;
  try {
    schema = driver.getSchema();
  } catch (Exception ex) {
    LOG.error("Error getting schema for query: " + query.query, ex);
  }

  FetchWork work = getFetchWork();
  String tabledir = null;
  String tablename = null;
  String sep = null;
  if (work != null) {
    tabledir = work.getTblDir();
    TableDesc desc = work.getTblDesc();
    if (desc != null) {
      sep = desc.getProperties().getProperty(
          Constants.SERIALIZATION_FORMAT, "" + Utilities.ctrlaCode);
      tablename = desc.getTableName();
    }
  }
  return new ResultsMetadata(schema, tabledir, tablename, sep);
}
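// A minimal sketch of consuming the returned metadata (the getSep() accessor
// and the fetchedRow variable are assumptions): the separator is the table's
// serialization.format property, defaulting to Ctrl-A, so a fetched row can be
// split back into its columns with it.
ResultsMetadata meta = getResultMetadata();
String sep = (meta.getSep() != null) ? meta.getSep() : "" + Utilities.ctrlaCode; // assumed getter
String[] columns = fetchedRow.split(java.util.regex.Pattern.quote(sep), -1);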
/**
 * Set the current task in the mapredWork.
 *
 * @param alias_id current alias
 * @param topOp the top operator of the stack
 * @param plan current plan
 * @param local whether you need to add to map-reduce or local work
 * @param opProcCtx processing context
 * @param pList pruned partition list. If it is null it will be computed on-the-fly.
 */
public static void setTaskPlan(String alias_id, Operator<? extends Serializable> topOp,
    MapredWork plan, boolean local, GenMRProcContext opProcCtx, PrunedPartitionList pList)
    throws SemanticException {
  ParseContext parseCtx = opProcCtx.getParseCtx();
  Set<ReadEntity> inputs = opProcCtx.getInputs();

  ArrayList<Path> partDir = new ArrayList<Path>();
  ArrayList<PartitionDesc> partDesc = new ArrayList<PartitionDesc>();

  Path tblDir = null;
  TableDesc tblDesc = null;

  PrunedPartitionList partsList = pList;
  if (partsList == null) {
    try {
      partsList = parseCtx.getOpToPartList().get((TableScanOperator) topOp);
      if (partsList == null) {
        partsList = PartitionPruner.prune(parseCtx.getTopToTable().get(topOp),
            parseCtx.getOpToPartPruner().get(topOp), opProcCtx.getConf(),
            alias_id, parseCtx.getPrunedPartitions());
        parseCtx.getOpToPartList().put((TableScanOperator) topOp, partsList);
      }
    } catch (SemanticException e) {
      throw e;
    } catch (HiveException e) {
      LOG.error(org.apache.hadoop.util.StringUtils.stringifyException(e));
      throw new SemanticException(e.getMessage(), e);
    }
  }

  // Generate the map work for this alias_id
  Set<Partition> parts = null;

  // pass both confirmed and unknown partitions through the map-reduce framework
  parts = partsList.getConfirmedPartns();
  parts.addAll(partsList.getUnknownPartns());
  PartitionDesc aliasPartnDesc = null;
  try {
    if (!parts.isEmpty()) {
      aliasPartnDesc = Utilities.getPartitionDesc(parts.iterator().next());
    }
  } catch (HiveException e) {
    LOG.error(org.apache.hadoop.util.StringUtils.stringifyException(e));
    throw new SemanticException(e.getMessage(), e);
  }

  // The table does not have any partitions
  if (aliasPartnDesc == null) {
    aliasPartnDesc = new PartitionDesc(
        Utilities.getTableDesc(parseCtx.getTopToTable().get(topOp)), null);
  }

  plan.getAliasToPartnInfo().put(alias_id, aliasPartnDesc);

  for (Partition part : parts) {
    if (part.getTable().isPartitioned()) {
      inputs.add(new ReadEntity(part));
    } else {
      inputs.add(new ReadEntity(part.getTable()));
    }

    // Later the properties have to come from the partition as opposed
    // to from the table in order to support versioning.
    Path[] paths;
    sampleDesc sampleDescr = parseCtx.getOpToSamplePruner().get(topOp);

    if (sampleDescr != null) {
      paths = SamplePruner.prune(part, sampleDescr);
    } else {
      paths = part.getPath();
    }

    // is it a partitioned table ?
    if (!part.getTable().isPartitioned()) {
      assert ((tblDir == null) && (tblDesc == null));
      tblDir = paths[0];
      tblDesc = Utilities.getTableDesc(part.getTable());
    }

    for (Path p : paths) {
      if (p == null) {
        continue;
      }
      String path = p.toString();
      if (LOG.isDebugEnabled()) {
        LOG.debug("Adding " + path + " of table " + alias_id);
      }

      partDir.add(p);
      try {
        partDesc.add(Utilities.getPartitionDesc(part));
      } catch (HiveException e) {
        LOG.error(org.apache.hadoop.util.StringUtils.stringifyException(e));
        throw new SemanticException(e.getMessage(), e);
      }
    }
  }

  Iterator<Path> iterPath = partDir.iterator();
  Iterator<PartitionDesc> iterPartnDesc = partDesc.iterator();

  if (!local) {
    while (iterPath.hasNext()) {
      assert iterPartnDesc.hasNext();
      String path = iterPath.next().toString();

      PartitionDesc prtDesc = iterPartnDesc.next();

      // Add the path to alias mapping
      if (plan.getPathToAliases().get(path) == null) {
        plan.getPathToAliases().put(path, new ArrayList<String>());
      }
      plan.getPathToAliases().get(path).add(alias_id);
      plan.getPathToPartitionInfo().put(path, prtDesc);
      if (LOG.isDebugEnabled()) {
        LOG.debug("Information added for path " + path);
      }
    }

    assert plan.getAliasToWork().get(alias_id) == null;
    plan.getAliasToWork().put(alias_id, topOp);
  } else {
    // populate local work if needed
    MapredLocalWork localPlan = plan.getMapLocalWork();
    if (localPlan == null) {
      localPlan = new MapredLocalWork(
          new LinkedHashMap<String, Operator<? extends Serializable>>(),
          new LinkedHashMap<String, FetchWork>());
    }

    assert localPlan.getAliasToWork().get(alias_id) == null;
    assert localPlan.getAliasToFetchWork().get(alias_id) == null;
    localPlan.getAliasToWork().put(alias_id, topOp);
    if (tblDir == null) {
      localPlan.getAliasToFetchWork().put(alias_id,
          new FetchWork(FetchWork.convertPathToStringArray(partDir), partDesc));
    } else {
      localPlan.getAliasToFetchWork().put(alias_id,
          new FetchWork(tblDir.toString(), tblDesc));
    }
    plan.setMapLocalWork(localPlan);
  }
}
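// A minimal sketch of inspecting the plan after a non-local call (the alias
// name and the call site are assumptions; the getters are the ones already
// used above, plus PartitionDesc.getTableDesc()): each partition path should
// now map to both the scanning alias and its PartitionDesc, and the alias to
// its top operator.
setTaskPlan("t1", topOp, plan, false, opProcCtx, null);
for (Map.Entry<String, ArrayList<String>> e : plan.getPathToAliases().entrySet()) {
  System.out.println(e.getKey() + " -> " + e.getValue() + " / "
      + plan.getPathToPartitionInfo().get(e.getKey()).getTableDesc().getTableName());
}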
private void outputPlan(Task<? extends Serializable> task) {
  if (task == null) return;
  out.printf("Stage: %s\n", task.getId());

  // real output
  Serializable work = task.getWork();
  if (work == null) return;

  if (work instanceof FetchWork) {
    out.println("Fetch");
    output(((FetchWork) work).getSource());
  } else if (work instanceof MapredLocalWork) {
    out.println("MapredLocalWork");
    // fetch
    try {
      out.println("Fetch Part");
      Collection<FetchWork> fetchWorkCollect =
          ((MapredLocalWork) work).getAliasToFetchWork().values();
      for (FetchWork f : fetchWorkCollect) {
        output(f.getSource());
      }
    } catch (Exception e) {
      out.println("Exception 1");
    }
    // others
    try {
      out.println("Other Parts");
      Collection<Operator<? extends OperatorDesc>> collect =
          ((MapredLocalWork) work).getAliasToWork().values();
      for (Operator<? extends OperatorDesc> c : collect) {
        output(c);
      }
    } catch (Exception e) {
      out.println("Exception 2");
    }
  } else if (work instanceof MapredWork) {
    out.println("MapredWork");
    try {
      Collection<Operator<? extends OperatorDesc>> collect =
          ((MapredWork) work).getAllOperators();
      for (Operator<? extends OperatorDesc> c : collect) {
        // out.println(1);
        output(c);
        break; // the first operator will give out all info
      }
    } catch (Exception e) {
      out.println("Exception 3");
    }
  } else {
    output(work);
  }

  // -------other cases--------------------
  if (task instanceof ConditionalTask && ((ConditionalTask) task).getListTasks() != null) {
    for (Task<? extends Serializable> con : ((ConditionalTask) task).getListTasks()) {
      outputPlan(con);
    }
  }
  if (task.getChildTasks() != null) {
    for (Task<? extends Serializable> child : task.getChildTasks()) {
      outputPlan(child);
    }
  }
}
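// A minimal usage sketch (the plan variable is an assumption): since the
// printer recurses into conditional and child tasks, feeding it the root
// tasks of a QueryPlan walks the whole task DAG once.
for (Task<? extends Serializable> root : plan.getRootTasks()) {
  outputPlan(root);
}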