/**
 * Method to fetch table data.
 *
 * @param table table name
 * @param database database name
 * @return list of rows, with column values comma-separated
 * @throws Exception if any error occurs
 */
private List<String> getTableData(String table, String database) throws Exception {
  HiveConf conf = new HiveConf();
  conf.addResource("hive-site.xml");
  ArrayList<String> results = new ArrayList<String>();
  ArrayList<String> temp = new ArrayList<String>();
  Hive hive = Hive.get(conf);
  org.apache.hadoop.hive.ql.metadata.Table tbl = hive.getTable(database, table);

  FetchWork work;
  if (!tbl.getPartCols().isEmpty()) {
    // Partitioned table: build a FetchWork over all partition locations,
    // capped at 100 rows.
    List<Partition> partitions = hive.getPartitions(tbl);
    List<PartitionDesc> partDesc = new ArrayList<PartitionDesc>();
    List<String> partLocs = new ArrayList<String>();
    for (Partition part : partitions) {
      partLocs.add(part.getLocation());
      partDesc.add(Utilities.getPartitionDesc(part));
    }
    work = new FetchWork(partLocs, partDesc, Utilities.getTableDesc(tbl));
    work.setLimit(100);
  } else {
    // Non-partitioned table: fetch directly from the table's data location.
    work = new FetchWork(tbl.getDataLocation().toString(), Utilities.getTableDesc(tbl));
  }

  FetchTask task = new FetchTask();
  task.setWork(work);
  task.initialize(conf, null, null);
  task.fetch(temp);

  // FetchTask returns tab-separated rows; convert the separators to commas.
  for (String str : temp) {
    results.add(str.replace("\t", ","));
  }
  return results;
}
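// -----------------------------------------------------------------------------
// A minimal usage sketch for the method above, not part of the original source.
// It assumes the method lives in a hypothetical enclosing class named
// TableDataFetcher, that hive-site.xml is on the classpath, and that a table
// "src" exists in the "default" database; adjust those names for your setup.
// -----------------------------------------------------------------------------
public static void main(String[] args) throws Exception {
  TableDataFetcher fetcher = new TableDataFetcher(); // hypothetical class name
  List<String> rows = fetcher.getTableData("src", "default");
  for (String row : rows) {
    // Each entry is one fetched row with tab separators replaced by commas.
    System.out.println(row);
  }
}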
/**
 * Set the current task in the mapredWork.
 *
 * @param alias_id current alias
 * @param topOp the top operator of the stack
 * @param plan current plan
 * @param local whether you need to add to map-reduce or local work
 * @param opProcCtx processing context
 * @param pList pruned partition list. If it is null it will be computed on-the-fly.
 */
public static void setTaskPlan(String alias_id, Operator<? extends Serializable> topOp,
    MapredWork plan, boolean local, GenMRProcContext opProcCtx, PrunedPartitionList pList)
    throws SemanticException {
  ParseContext parseCtx = opProcCtx.getParseCtx();
  Set<ReadEntity> inputs = opProcCtx.getInputs();

  ArrayList<Path> partDir = new ArrayList<Path>();
  ArrayList<PartitionDesc> partDesc = new ArrayList<PartitionDesc>();

  Path tblDir = null;
  TableDesc tblDesc = null;

  PrunedPartitionList partsList = pList;

  // If no pruned partition list was supplied, look it up in the parse context
  // or compute it with the partition pruner.
  if (partsList == null) {
    try {
      partsList = parseCtx.getOpToPartList().get((TableScanOperator) topOp);
      if (partsList == null) {
        partsList = PartitionPruner.prune(parseCtx.getTopToTable().get(topOp),
            parseCtx.getOpToPartPruner().get(topOp), opProcCtx.getConf(),
            alias_id, parseCtx.getPrunedPartitions());
        parseCtx.getOpToPartList().put((TableScanOperator) topOp, partsList);
      }
    } catch (SemanticException e) {
      throw e;
    } catch (HiveException e) {
      LOG.error(org.apache.hadoop.util.StringUtils.stringifyException(e));
      throw new SemanticException(e.getMessage(), e);
    }
  }

  // Generate the map work for this alias_id.
  // Pass both confirmed and unknown partitions through the map-reduce framework.
  Set<Partition> parts = partsList.getConfirmedPartns();
  parts.addAll(partsList.getUnknownPartns());

  PartitionDesc aliasPartnDesc = null;
  try {
    if (!parts.isEmpty()) {
      aliasPartnDesc = Utilities.getPartitionDesc(parts.iterator().next());
    }
  } catch (HiveException e) {
    LOG.error(org.apache.hadoop.util.StringUtils.stringifyException(e));
    throw new SemanticException(e.getMessage(), e);
  }

  // The table does not have any partitions.
  if (aliasPartnDesc == null) {
    aliasPartnDesc = new PartitionDesc(
        Utilities.getTableDesc(parseCtx.getTopToTable().get(topOp)), null);
  }

  plan.getAliasToPartnInfo().put(alias_id, aliasPartnDesc);

  for (Partition part : parts) {
    if (part.getTable().isPartitioned()) {
      inputs.add(new ReadEntity(part));
    } else {
      inputs.add(new ReadEntity(part.getTable()));
    }

    // Later the properties have to come from the partition as opposed
    // to from the table in order to support versioning.
    Path[] paths;
    sampleDesc sampleDescr = parseCtx.getOpToSamplePruner().get(topOp);

    if (sampleDescr != null) {
      paths = SamplePruner.prune(part, sampleDescr);
    } else {
      paths = part.getPath();
    }

    // Is it a non-partitioned table? If so, remember the single table
    // directory and descriptor for the local-work branch below.
    if (!part.getTable().isPartitioned()) {
      assert ((tblDir == null) && (tblDesc == null));
      tblDir = paths[0];
      tblDesc = Utilities.getTableDesc(part.getTable());
    }

    for (Path p : paths) {
      if (p == null) {
        continue;
      }
      String path = p.toString();
      if (LOG.isDebugEnabled()) {
        LOG.debug("Adding " + path + " of table " + alias_id);
      }

      partDir.add(p);
      try {
        partDesc.add(Utilities.getPartitionDesc(part));
      } catch (HiveException e) {
        LOG.error(org.apache.hadoop.util.StringUtils.stringifyException(e));
        throw new SemanticException(e.getMessage(), e);
      }
    }
  }

  Iterator<Path> iterPath = partDir.iterator();
  Iterator<PartitionDesc> iterPartnDesc = partDesc.iterator();

  if (!local) {
    while (iterPath.hasNext()) {
      assert iterPartnDesc.hasNext();
      String path = iterPath.next().toString();
      PartitionDesc prtDesc = iterPartnDesc.next();

      // Add the path to alias mapping.
      if (plan.getPathToAliases().get(path) == null) {
        plan.getPathToAliases().put(path, new ArrayList<String>());
      }
      plan.getPathToAliases().get(path).add(alias_id);
      plan.getPathToPartitionInfo().put(path, prtDesc);
      if (LOG.isDebugEnabled()) {
        LOG.debug("Information added for path " + path);
      }
    }

    assert plan.getAliasToWork().get(alias_id) == null;
    plan.getAliasToWork().put(alias_id, topOp);
  } else {
    // Populate local work if needed.
    MapredLocalWork localPlan = plan.getMapLocalWork();
    if (localPlan == null) {
      localPlan = new MapredLocalWork(
          new LinkedHashMap<String, Operator<? extends Serializable>>(),
          new LinkedHashMap<String, FetchWork>());
    }

    assert localPlan.getAliasToWork().get(alias_id) == null;
    assert localPlan.getAliasToFetchWork().get(alias_id) == null;
    localPlan.getAliasToWork().put(alias_id, topOp);
    if (tblDir == null) {
      localPlan.getAliasToFetchWork().put(alias_id,
          new FetchWork(FetchWork.convertPathToStringArray(partDir), partDesc));
    } else {
      localPlan.getAliasToFetchWork().put(alias_id,
          new FetchWork(tblDir.toString(), tblDesc));
    }
    plan.setMapLocalWork(localPlan);
  }
}
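// -----------------------------------------------------------------------------
// Hedged illustration, not from the Hive source: a helper sketch showing what
// the non-local branch of setTaskPlan(...) leaves behind in the plan. It assumes
// 'plan' is the same MapredWork instance that was passed to setTaskPlan, and
// that java.util.Map is imported alongside the other java.util types used above.
// -----------------------------------------------------------------------------
private static void dumpPathMappings(MapredWork plan) {
  for (Map.Entry<String, ArrayList<String>> entry : plan.getPathToAliases().entrySet()) {
    String path = entry.getKey();
    // Each input path is mapped to its aliases and to a PartitionDesc.
    PartitionDesc desc = plan.getPathToPartitionInfo().get(path);
    System.out.println(path + " -> aliases " + entry.getValue()
        + (desc != null ? " (partition desc attached)" : ""));
  }
}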