// TODO this has to find a better home, it's also hardcoded as default in hive;
// it would be nice if the default was decided by the serde
static void initializeOutputSerDe(SerDe serDe, Configuration conf, OutputJobInfo jobInfo)
    throws SerDeException {
  serDe.initialize(conf, getSerdeProperties(jobInfo.getTableInfo(), jobInfo.getOutputSchema()));
}
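// Illustrative call site (a sketch, not taken from this codebase): a record-writer
// container would typically instantiate the storage handler's SerDe and then
// initialize it against the job's output schema. "storageHandler" and "jobInfo"
// are assumed to be provided by the surrounding HCatalog write path.
//
//   SerDe serDe = ReflectionUtils.newInstance(storageHandler.getSerDeClass(), conf);
//   initializeOutputSerDe(serDe, conf, jobInfo);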
/**
 * Run a local map reduce job to load data from in-memory records to an HCatalog table.
 *
 * @param partitionValues map of partition key to value for the partition being written,
 *          or null when no static partition is specified
 * @param partitionColumns schema of the columns being written
 * @param records data to be written to HCatalog table
 * @param writeCount number of records expected to be written
 * @param assertWrite whether to assert that exactly writeCount records were written
 * @param asSingleMapTask whether the job should run as a single map task
 * @param customDynamicPathPattern custom dynamic-partition path pattern, or null for the default
 * @return the completed Job
 * @throws Exception if the job fails to run
 */
Job runMRCreate(
    Map<String, String> partitionValues,
    List<HCatFieldSchema> partitionColumns,
    List<HCatRecord> records,
    int writeCount,
    boolean assertWrite,
    boolean asSingleMapTask,
    String customDynamicPathPattern)
    throws Exception {
  writeRecords = records;
  MapCreate.writeCount = 0;

  Configuration conf = new Configuration();
  Job job = new Job(conf, "hcat mapreduce write test");
  job.setJarByClass(this.getClass());
  job.setMapperClass(HCatMapReduceTest.MapCreate.class);

  // input/output settings
  job.setInputFormatClass(TextInputFormat.class);

  if (asSingleMapTask) {
    // One input path would mean only one map task
    Path path = new Path(fs.getWorkingDirectory(), "mapred/testHCatMapReduceInput");
    createInputFile(path, writeCount);
    TextInputFormat.setInputPaths(job, path);
  } else {
    // Create two input paths so that two map tasks get triggered. There could be other ways
    // to trigger two map tasks.
    Path path = new Path(fs.getWorkingDirectory(), "mapred/testHCatMapReduceInput");
    createInputFile(path, writeCount / 2);
    Path path2 = new Path(fs.getWorkingDirectory(), "mapred/testHCatMapReduceInput2");
    createInputFile(path2, (writeCount - writeCount / 2));
    TextInputFormat.setInputPaths(job, path, path2);
  }

  job.setOutputFormatClass(HCatOutputFormat.class);

  OutputJobInfo outputJobInfo = OutputJobInfo.create(dbName, tableName, partitionValues);
  if (customDynamicPathPattern != null) {
    job.getConfiguration()
        .set(HCatConstants.HCAT_DYNAMIC_CUSTOM_PATTERN, customDynamicPathPattern);
  }
  HCatOutputFormat.setOutput(job, outputJobInfo);

  job.setMapOutputKeyClass(BytesWritable.class);
  job.setMapOutputValueClass(DefaultHCatRecord.class);

  job.setNumReduceTasks(0);

  HCatOutputFormat.setSchema(job, new HCatSchema(partitionColumns));

  boolean success = job.waitForCompletion(true);

  // Ensure counters are set when data has actually been read.
  if (partitionValues != null) {
    assertTrue(
        job.getCounters().getGroup("FileSystemCounters").findCounter("FILE_BYTES_READ").getValue()
            > 0);
  }

  if (!HCatUtil.isHadoop23()) {
    // Local mode outputcommitter hook is not invoked in Hadoop 1.x
    if (success) {
      new FileOutputCommitterContainer(job, null).commitJob(job);
    } else {
      new FileOutputCommitterContainer(job, null).abortJob(job, JobStatus.State.FAILED);
    }
  }
  if (assertWrite) {
    // we assert only if we expected to assert with this call.
    Assert.assertEquals(writeCount, MapCreate.writeCount);
  }

  if (isTableExternal()) {
    externalTableLocation = outputJobInfo.getTableInfo().getTableLocation();
  }

  return job;
}
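// Usage sketch (hypothetical values; "partitionColumns" and the record-building
// helper "generateRecords" are assumed to be supplied by the test harness):
//
//   Map<String, String> partitionValues = new HashMap<String, String>();
//   partitionValues.put("part1", "p1value1");
//   // write 20 records into the partition from a single map task, assert that
//   // all 20 arrived, and use the default dynamic path pattern (null)
//   runMRCreate(partitionValues, partitionColumns, generateRecords(20), 20,
//       true, true, null);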
@Override
public void configureOutputJobProperties(TableDesc tableDesc, Map<String, String> jobProperties) {
  try {
    OutputJobInfo jobInfo =
        (OutputJobInfo)
            HCatUtil.deserialize(
                tableDesc.getJobProperties().get(HCatConstants.HCAT_KEY_OUTPUT_INFO));
    String parentPath = jobInfo.getTableInfo().getTableLocation();
    String dynHash = tableDesc.getJobProperties().get(HCatConstants.HCAT_DYNAMIC_PTN_JOBID);
    String idHash = tableDesc.getJobProperties().get(HCatConstants.HCAT_OUTPUT_ID_HASH);

    // For dynamic partitioned writes without all keyvalues specified,
    // we create a temp dir for the associated write job
    if (dynHash != null) {
      // if external table and custom root specified, update the parent path
      if (Boolean.valueOf((String) tableDesc.getProperties().get("EXTERNAL"))
          && jobInfo.getCustomDynamicRoot() != null
          && jobInfo.getCustomDynamicRoot().length() > 0) {
        parentPath = new Path(parentPath, jobInfo.getCustomDynamicRoot()).toString();
      }
      parentPath =
          new Path(parentPath, FileOutputCommitterContainer.DYNTEMP_DIR_NAME + dynHash).toString();
    } else {
      parentPath =
          new Path(parentPath, FileOutputCommitterContainer.SCRATCH_DIR_NAME + idHash).toString();
    }

    String outputLocation;

    if ((dynHash != null)
        && Boolean.valueOf((String) tableDesc.getProperties().get("EXTERNAL"))
        && jobInfo.getCustomDynamicPath() != null
        && jobInfo.getCustomDynamicPath().length() > 0) {
      // dynamic partitioning with custom path; resolve the custom path
      // using partition column values
      outputLocation = HCatFileUtil.resolveCustomPath(jobInfo, null, true);
    } else if ((dynHash == null)
        && Boolean.valueOf((String) tableDesc.getProperties().get("EXTERNAL"))
        && jobInfo.getLocation() != null
        && jobInfo.getLocation().length() > 0) {
      // honor custom location for external table apart from what metadata specifies
      outputLocation = jobInfo.getLocation();
    } else if (dynHash == null && jobInfo.getPartitionValues().size() == 0) {
      // Unpartitioned table, writing to the scratch dir directly is good enough.
      outputLocation = "";
    } else {
      List<String> cols = new ArrayList<String>();
      List<String> values = new ArrayList<String>();

      // Get the output location in the order partition keys are defined for the table.
      for (String name : jobInfo.getTableInfo().getPartitionColumns().getFieldNames()) {
        String value = jobInfo.getPartitionValues().get(name);
        cols.add(name);
        values.add(value);
      }
      outputLocation = FileUtils.makePartName(cols, values);
    }

    if (outputLocation != null && !outputLocation.isEmpty()) {
      jobInfo.setLocation(new Path(parentPath, outputLocation).toString());
    } else {
      jobInfo.setLocation(new Path(parentPath).toString());
    }

    // only set output dir if partition is fully materialized
    if (jobInfo.getPartitionValues().size()
        == jobInfo.getTableInfo().getPartitionColumns().size()) {
      jobProperties.put("mapred.output.dir", jobInfo.getLocation());
    }

    SpecialCases.addSpecialCasesParametersToOutputJobProperties(jobProperties, jobInfo, ofClass);

    jobProperties.put(HCatConstants.HCAT_KEY_OUTPUT_INFO, HCatUtil.serialize(jobInfo));
  } catch (IOException e) {
    throw new IllegalStateException("Failed to set output path", e);
  }
}
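// For reference (a sketch of the resulting layout, not additional behavior):
// FileUtils.makePartName renders the partition keys/values as a relative path,
// e.g. cols ["ds", "cluster"] with values ["1", "ag"] yield "ds=1/cluster=ag".
// That fragment is appended to the scratch/dyntemp parent computed above, so a
// fully specified static write lands under something like
// <tableLocation>/<SCRATCH_DIR_NAME><idHash>/ds=1/cluster=ag
// before the output committer relocates it to its final location.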
/**
 * Simple test case.
 *
 * <ol>
 *   <li>Submits a mapred job which writes out one fixed line to each of the tables
 *   <li>uses hive fetch task to read the data and see if it matches what was written
 * </ol>
 *
 * @throws Throwable if any error occurs
 */
@Test
public void testOutputFormat() throws Throwable {
  HashMap<String, String> partitionValues = new HashMap<String, String>();
  partitionValues.put("ds", "1");
  partitionValues.put("cluster", "ag");
  ArrayList<OutputJobInfo> infoList = new ArrayList<OutputJobInfo>();
  infoList.add(OutputJobInfo.create("default", tableNames[0], partitionValues));
  infoList.add(OutputJobInfo.create("default", tableNames[1], partitionValues));
  infoList.add(OutputJobInfo.create("default", tableNames[2], partitionValues));

  Job job = new Job(hiveConf, "SampleJob");

  job.setMapperClass(MyMapper.class);
  job.setInputFormatClass(TextInputFormat.class);
  job.setOutputFormatClass(MultiOutputFormat.class);
  job.setNumReduceTasks(0);

  JobConfigurer configurer = MultiOutputFormat.createConfigurer(job);

  for (int i = 0; i < tableNames.length; i++) {
    configurer.addOutputFormat(
        tableNames[i], HCatOutputFormat.class, BytesWritable.class, HCatRecord.class);
    HCatOutputFormat.setOutput(configurer.getJob(tableNames[i]), infoList.get(i));
    HCatOutputFormat.setSchema(configurer.getJob(tableNames[i]), schemaMap.get(tableNames[i]));
  }
  configurer.configure();

  Path filePath = createInputFile();
  FileInputFormat.addInputPath(job, filePath);
  Assert.assertTrue(job.waitForCompletion(true));

  ArrayList<String> outputs = new ArrayList<String>();
  for (String tbl : tableNames) {
    outputs.add(getTableData(tbl, "default").get(0));
  }
  Assert.assertEquals(
      "Output of table " + tableNames[0] + " is not correct", "a,a,1,ag", outputs.get(0));
  Assert.assertEquals(
      "Output of table " + tableNames[1] + " is not correct", "a,1,ag", outputs.get(1));
  Assert.assertEquals(
      "Output of table " + tableNames[2] + " is not correct", "a,a,extra,1,ag", outputs.get(2));

  // Check permission on partition dirs and files created
  for (int i = 0; i < tableNames.length; i++) {
    Path partitionFile =
        new Path(warehousedir + "/" + tableNames[i] + "/ds=1/cluster=ag/part-m-00000");
    FileSystem fs = partitionFile.getFileSystem(mrConf);
    Assert.assertEquals(
        "File permissions of table " + tableNames[i] + " are not correct",
        new FsPermission(tablePerms[i]),
        fs.getFileStatus(partitionFile).getPermission());
    Assert.assertEquals(
        "File permissions of table " + tableNames[i] + " are not correct",
        new FsPermission(tablePerms[i]),
        fs.getFileStatus(partitionFile.getParent()).getPermission());
    Assert.assertEquals(
        "File permissions of table " + tableNames[i] + " are not correct",
        new FsPermission(tablePerms[i]),
        fs.getFileStatus(partitionFile.getParent().getParent()).getPermission());
  }
  LOG.info("File permissions verified");
}
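// For context, a mapper feeding MultiOutputFormat (such as MyMapper above) is
// expected to route each record to a configured alias via the static write
// helper. A minimal sketch of that pattern, with hypothetical record values
// matching table 0's expected non-partition columns ("a", "a"):
//
//   HCatRecord record = new DefaultHCatRecord(2);
//   record.set(0, "a");
//   record.set(1, "a");
//   MultiOutputFormat.write(tableNames[0], null, record, context);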