  // TODO: this needs a better home; it is also hardcoded as the default in Hive.
  // It would be nice if the default were decided by the SerDe.
  static void initializeOutputSerDe(SerDe serDe, Configuration conf, OutputJobInfo jobInfo)
      throws SerDeException {
    serDe.initialize(conf, getSerdeProperties(jobInfo.getTableInfo(), jobInfo.getOutputSchema()));
  }
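
  // Usage sketch (not in the original source): one way the helper above might be invoked.
  // LazySimpleSerDe is an illustrative assumption; real callers may obtain a different SerDe
  // from the table's storage descriptor.
  static void exampleInitializeOutputSerDe(Configuration conf, OutputJobInfo jobInfo)
      throws SerDeException {
    SerDe serDe = new org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe();
    // Push the table and output-schema properties into the SerDe so that subsequent
    // serialize() calls produce rows matching the HCatalog output schema.
    initializeOutputSerDe(serDe, conf, jobInfo);
  }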
  /**
   * Runs a local MapReduce job to load data from in-memory records into an HCatalog table. A
   * usage sketch follows this method.
   *
   * @param partitionValues partition key/value pairs for the write (may be null)
   * @param partitionColumns schema of the columns being written
   * @param records data to be written to the HCatalog table
   * @param writeCount number of records expected to be written
   * @param assertWrite whether to assert that exactly writeCount records were written
   * @param asSingleMapTask whether to use a single input path so that only one map task runs
   * @param customDynamicPathPattern custom dynamic-partitioning path pattern, or null for the default
   * @return the completed Job
   * @throws Exception if the job fails
   */
  Job runMRCreate(
      Map<String, String> partitionValues,
      List<HCatFieldSchema> partitionColumns,
      List<HCatRecord> records,
      int writeCount,
      boolean assertWrite,
      boolean asSingleMapTask,
      String customDynamicPathPattern)
      throws Exception {

    writeRecords = records;
    MapCreate.writeCount = 0;

    Configuration conf = new Configuration();
    Job job = new Job(conf, "hcat mapreduce write test");
    job.setJarByClass(this.getClass());
    job.setMapperClass(HCatMapReduceTest.MapCreate.class);

    // input/output settings
    job.setInputFormatClass(TextInputFormat.class);

    if (asSingleMapTask) {
      // One input path would mean only one map task
      Path path = new Path(fs.getWorkingDirectory(), "mapred/testHCatMapReduceInput");
      createInputFile(path, writeCount);
      TextInputFormat.setInputPaths(job, path);
    } else {
      // Create two input paths so that two map tasks get triggered. There could be other ways
      // to trigger two map tasks.
      Path path = new Path(fs.getWorkingDirectory(), "mapred/testHCatMapReduceInput");
      createInputFile(path, writeCount / 2);

      Path path2 = new Path(fs.getWorkingDirectory(), "mapred/testHCatMapReduceInput2");
      createInputFile(path2, (writeCount - writeCount / 2));

      TextInputFormat.setInputPaths(job, path, path2);
    }

    job.setOutputFormatClass(HCatOutputFormat.class);

    OutputJobInfo outputJobInfo = OutputJobInfo.create(dbName, tableName, partitionValues);
    if (customDynamicPathPattern != null) {
      job.getConfiguration()
          .set(HCatConstants.HCAT_DYNAMIC_CUSTOM_PATTERN, customDynamicPathPattern);
    }
    HCatOutputFormat.setOutput(job, outputJobInfo);

    job.setMapOutputKeyClass(BytesWritable.class);
    job.setMapOutputValueClass(DefaultHCatRecord.class);

    job.setNumReduceTasks(0);

    HCatOutputFormat.setSchema(job, new HCatSchema(partitionColumns));

    boolean success = job.waitForCompletion(true);

    // Ensure counters are set when data has actually been read.
    if (partitionValues != null) {
      assertTrue(
          job.getCounters().getGroup("FileSystemCounters").findCounter("FILE_BYTES_READ").getValue()
              > 0);
    }

    if (!HCatUtil.isHadoop23()) {
      // The local-mode OutputCommitter hook is not invoked in Hadoop 1.x, so commit or abort explicitly
      if (success) {
        new FileOutputCommitterContainer(job, null).commitJob(job);
      } else {
        new FileOutputCommitterContainer(job, null).abortJob(job, JobStatus.State.FAILED);
      }
    }
    if (assertWrite) {
      // Assert the write count only when this call expects the write to have succeeded.
      Assert.assertEquals(writeCount, MapCreate.writeCount);
    }

    if (isTableExternal()) {
      externalTableLocation = outputJobInfo.getTableInfo().getTableLocation();
    }

    return job;
  }
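
  // Usage sketch (not in the original suite): a typical call to runMRCreate for a single
  // static partition. The "part1"/"p1value1" pair, the count of 20, and the column/record
  // lists passed in are assumptions made for illustration.
  private void exampleRunMRCreate(List<HCatFieldSchema> partitionColumns, List<HCatRecord> records)
      throws Exception {
    Map<String, String> partitionValues = new HashMap<String, String>();
    partitionValues.put("part1", "p1value1");
    // Write 20 records in a single map task, assert that all 20 were written, and use no
    // custom dynamic-partition path pattern.
    runMRCreate(partitionValues, partitionColumns, records, 20, true, true, null);
  }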
  @Override
  public void configureOutputJobProperties(TableDesc tableDesc, Map<String, String> jobProperties) {
    try {
      OutputJobInfo jobInfo =
          (OutputJobInfo)
              HCatUtil.deserialize(
                  tableDesc.getJobProperties().get(HCatConstants.HCAT_KEY_OUTPUT_INFO));
      String parentPath = jobInfo.getTableInfo().getTableLocation();
      String dynHash = tableDesc.getJobProperties().get(HCatConstants.HCAT_DYNAMIC_PTN_JOBID);
      String idHash = tableDesc.getJobProperties().get(HCatConstants.HCAT_OUTPUT_ID_HASH);

      // For dynamic partitioned writes without all partition key values specified,
      // create a temp dir for the associated write job.
      if (dynHash != null) {
        // if external table and custom root specified, update the parent path
        if (Boolean.valueOf((String) tableDesc.getProperties().get("EXTERNAL"))
            && jobInfo.getCustomDynamicRoot() != null
            && jobInfo.getCustomDynamicRoot().length() > 0) {
          parentPath = new Path(parentPath, jobInfo.getCustomDynamicRoot()).toString();
        }
        parentPath =
            new Path(parentPath, FileOutputCommitterContainer.DYNTEMP_DIR_NAME + dynHash)
                .toString();
      } else {
        parentPath =
            new Path(parentPath, FileOutputCommitterContainer.SCRATCH_DIR_NAME + idHash).toString();
      }

      String outputLocation;

      if ((dynHash != null)
          && Boolean.valueOf((String) tableDesc.getProperties().get("EXTERNAL"))
          && jobInfo.getCustomDynamicPath() != null
          && jobInfo.getCustomDynamicPath().length() > 0) {
        // dynamic partitioning with custom path; resolve the custom path
        // using partition column values
        outputLocation = HCatFileUtil.resolveCustomPath(jobInfo, null, true);
      } else if ((dynHash == null)
          && Boolean.valueOf((String) tableDesc.getProperties().get("EXTERNAL"))
          && jobInfo.getLocation() != null
          && jobInfo.getLocation().length() > 0) {
        // honor custom location for external table apart from what metadata specifies
        outputLocation = jobInfo.getLocation();
      } else if (dynHash == null && jobInfo.getPartitionValues().size() == 0) {
        // Unpartitioned table, writing to the scratch dir directly is good enough.
        outputLocation = "";
      } else {
        List<String> cols = new ArrayList<String>();
        List<String> values = new ArrayList<String>();

        // Get the output location in the order partition keys are defined for the table.
        for (String name : jobInfo.getTableInfo().getPartitionColumns().getFieldNames()) {
          String value = jobInfo.getPartitionValues().get(name);
          cols.add(name);
          values.add(value);
        }
        outputLocation = FileUtils.makePartName(cols, values);
      }

      if (outputLocation != null && !outputLocation.isEmpty()) {
        jobInfo.setLocation(new Path(parentPath, outputLocation).toString());
      } else {
        jobInfo.setLocation(new Path(parentPath).toString());
      }

      // Only set the output dir when every partition key value is specified (fully static partition).
      if (jobInfo.getPartitionValues().size()
          == jobInfo.getTableInfo().getPartitionColumns().size()) {
        jobProperties.put("mapred.output.dir", jobInfo.getLocation());
      }

      SpecialCases.addSpecialCasesParametersToOutputJobProperties(jobProperties, jobInfo, ofClass);

      jobProperties.put(HCatConstants.HCAT_KEY_OUTPUT_INFO, HCatUtil.serialize(jobInfo));
    } catch (IOException e) {
      throw new IllegalStateException("Failed to set output path", e);
    }
  }
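
  // Illustration (not part of the original class): how the partition sub-path appended to
  // parentPath above is formed for a fully specified static partition. FileUtils.makePartName
  // joins key=value pairs with '/' in column order; the column names and values here are
  // example data only.
  static String examplePartitionSubPath() {
    List<String> cols = java.util.Arrays.asList("ds", "cluster");
    List<String> values = java.util.Arrays.asList("20110924", "ag");
    // Yields "ds=20110924/cluster=ag", which is then resolved against the table's scratch
    // or dynamic-partition temp directory.
    return FileUtils.makePartName(cols, values);
  }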
  /**
   * Simple test case.
   *
   * <ol>
   *   <li>Submits a mapred job which writes out one fixed line to each of the tables
   *   <li>uses hive fetch task to read the data and see if it matches what was written
   * </ol>
   *
   * @throws Exception if any error occurs
   */
  @Test
  public void testOutputFormat() throws Throwable {
    HashMap<String, String> partitionValues = new HashMap<String, String>();
    partitionValues.put("ds", "1");
    partitionValues.put("cluster", "ag");
    ArrayList<OutputJobInfo> infoList = new ArrayList<OutputJobInfo>();
    infoList.add(OutputJobInfo.create("default", tableNames[0], partitionValues));
    infoList.add(OutputJobInfo.create("default", tableNames[1], partitionValues));
    infoList.add(OutputJobInfo.create("default", tableNames[2], partitionValues));

    Job job = new Job(hiveConf, "SampleJob");

    job.setMapperClass(MyMapper.class);
    job.setInputFormatClass(TextInputFormat.class);
    job.setOutputFormatClass(MultiOutputFormat.class);
    job.setNumReduceTasks(0);

    JobConfigurer configurer = MultiOutputFormat.createConfigurer(job);

    for (int i = 0; i < tableNames.length; i++) {
      configurer.addOutputFormat(
          tableNames[i], HCatOutputFormat.class, BytesWritable.class, HCatRecord.class);
      HCatOutputFormat.setOutput(configurer.getJob(tableNames[i]), infoList.get(i));
      HCatOutputFormat.setSchema(configurer.getJob(tableNames[i]), schemaMap.get(tableNames[i]));
    }
    configurer.configure();

    Path filePath = createInputFile();
    FileInputFormat.addInputPath(job, filePath);
    Assert.assertTrue(job.waitForCompletion(true));

    ArrayList<String> outputs = new ArrayList<String>();
    for (String tbl : tableNames) {
      outputs.add(getTableData(tbl, "default").get(0));
    }
    Assert.assertEquals(
        "Output of table " + tableNames[0] + " is not correct", "a,a,1,ag", outputs.get(0));
    Assert.assertEquals(
        "Output of table " + tableNames[1] + " is not correct", "a,1,ag", outputs.get(1));
    Assert.assertEquals(
        "Output of table " + tableNames[2] + " is not correct", "a,a,extra,1,ag", outputs.get(2));

    // Check permissions on the partition dirs and files created
    for (int i = 0; i < tableNames.length; i++) {
      Path partitionFile =
          new Path(warehousedir + "/" + tableNames[i] + "/ds=1/cluster=ag/part-m-00000");
      FileSystem fs = partitionFile.getFileSystem(mrConf);
      Assert.assertEquals(
          "File permissions of table " + tableNames[i] + " are not correct",
          new FsPermission(tablePerms[i]),
          fs.getFileStatus(partitionFile).getPermission());
      Assert.assertEquals(
          "Partition directory permissions of table " + tableNames[i] + " are not correct",
          new FsPermission(tablePerms[i]),
          fs.getFileStatus(partitionFile.getParent()).getPermission());
      Assert.assertEquals(
          "Partition directory permissions of table " + tableNames[i] + " are not correct",
          new FsPermission(tablePerms[i]),
          fs.getFileStatus(partitionFile.getParent().getParent()).getPermission());
    }
    LOG.info("File permissions verified");
  }