/**
 * Checks how many times the input stream is opened while statistics are gathered for a
 * single file: {@code DEFAULT_NUM_SAMPLES} opens for an unconfigured format, and 8 opens
 * once {@code setNumLineSamples(8)} has been called.
 */
@Test
public void testNumSamplesOneFile() {
    try {
        final String sourceFile = TestFileUtils.createTempFile(TEST_DATA1);
        final Configuration config = new Configuration();

        // Both formats read the same rewritten path.
        // NOTE(review): presumably the "test" scheme routes through TestFileSystem so that
        // stream opens are counted — confirm against TestFileSystem.
        final String countedPath = sourceFile.replace("file", "test");

        // Case 1: number of line samples left at its default.
        final TestDelimitedInputFormat defaultSampling = new TestDelimitedInputFormat();
        defaultSampling.setFilePath(countedPath);
        defaultSampling.configure(config);

        TestFileSystem.resetStreamOpenCounter();
        defaultSampling.getStatistics(null);
        Assert.assertEquals(
                "Wrong number of samples taken.",
                DEFAULT_NUM_SAMPLES,
                TestFileSystem.getNumtimeStreamOpened());

        // Case 2: number of line samples explicitly set to 8.
        final TestDelimitedInputFormat eightSamples = new TestDelimitedInputFormat();
        eightSamples.setFilePath(countedPath);
        eightSamples.setNumLineSamples(8);
        eightSamples.configure(config);

        TestFileSystem.resetStreamOpenCounter();
        eightSamples.getStatistics(null);
        Assert.assertEquals(
                "Wrong number of samples taken.",
                8,
                TestFileSystem.getNumtimeStreamOpened());
    } catch (Exception ex) {
        ex.printStackTrace();
        Assert.fail(ex.getMessage());
    }
}
/**
 * Checks that passing previously computed statistics back into {@code getStatistics}
 * avoids re-sampling: the first call (no cached statistics) opens the stream
 * {@code DEFAULT_NUM_SAMPLES} times, while the second call (given the first result)
 * opens no stream and returns the very same statistics instance.
 */
@Test
public void testCachedStatistics() {
    try {
        final String tempFile = TestFileUtils.createTempFile(TEST_DATA1);
        final Configuration conf = new Configuration();

        // First pass: nothing cached, so the file has to be sampled.
        final TestDelimitedInputFormat format = new TestDelimitedInputFormat();
        format.setFilePath("test://" + tempFile);
        format.configure(conf);

        TestFileSystem.resetStreamOpenCounter();
        final BaseStatistics stats = format.getStatistics(null);
        Assert.assertEquals(
                "Wrong number of samples taken.",
                DEFAULT_NUM_SAMPLES,
                TestFileSystem.getNumtimeStreamOpened());

        // Second pass: a fresh format on the same path, handed the earlier statistics.
        final TestDelimitedInputFormat format2 = new TestDelimitedInputFormat();
        format2.setFilePath("test://" + tempFile);
        format2.configure(conf);

        TestFileSystem.resetStreamOpenCounter();
        final BaseStatistics stats2 = format2.getStatistics(stats);

        // assertEquals/assertSame report the actual values on failure, unlike assertTrue.
        Assert.assertEquals(
                "Using cached statistics should circumvent sampling.",
                0,
                TestFileSystem.getNumtimeStreamOpened());
        Assert.assertSame(
                "Using cached statistics should circumvent sampling.", stats, stats2);
    } catch (Exception e) {
        e.printStackTrace();
        Assert.fail(e.getMessage());
    }
}
/**
 * Checks statistics for input whose records are separated by a custom multi-character
 * delimiter instead of a newline. The reported record count must lie strictly within
 * one of {@code TEST_DATA_1_LINES}, and the reported average record width strictly
 * within one of the data length divided by {@code TEST_DATA_1_LINES}.
 */
@Test
public void testDifferentDelimiter() {
    try {
        final String delimiter = "12345678-";

        // Same content as TEST_DATA1, with every newline swapped for the custom delimiter.
        final String testData = TEST_DATA1.replace("\n", delimiter);
        final String tempFile = TestFileUtils.createTempFile(testData);
        final Configuration conf = new Configuration();

        final TestDelimitedInputFormat format = new TestDelimitedInputFormat();
        format.setFilePath(tempFile);
        format.setDelimiter(delimiter);
        format.configure(conf);

        final BaseStatistics stats = format.getStatistics(null);

        final int numLines = TEST_DATA_1_LINES;
        final float avgWidth = ((float) testData.length()) / numLines;

        // Logical && is the idiomatic operator for boolean conditions (was bitwise &).
        Assert.assertTrue(
                "Wrong record count.",
                stats.getNumberOfRecords() < numLines + 1
                        && stats.getNumberOfRecords() > numLines - 1);
        Assert.assertTrue(
                "Wrong avg record size.",
                stats.getAverageRecordWidth() < avgWidth + 1
                        && stats.getAverageRecordWidth() > avgWidth - 1);
    } catch (Exception e) {
        e.printStackTrace();
        Assert.fail(e.getMessage());
    }
}
/**
 * Checks that {@code getStatistics} returns {@code null}, without throwing, for a temp
 * file created with a size argument of twice
 * {@code PactConfigConstants.DEFAULT_DELIMITED_FORMAT_MAX_SAMPLE_LEN}.
 */
@Test
public void testSamplingOverlyLongRecord() {
    try {
        // NOTE(review): presumably this yields a file holding a single record longer than
        // the sampling limit — confirm against TestFileUtils.createTempFile.
        final String tempFile =
                TestFileUtils.createTempFile(
                        2 * PactConfigConstants.DEFAULT_DELIMITED_FORMAT_MAX_SAMPLE_LEN);
        final Configuration conf = new Configuration();

        final TestDelimitedInputFormat format = new TestDelimitedInputFormat();
        format.setFilePath(tempFile);
        format.configure(conf);

        // The test passes only on a null result; any exception reaches the catch block and
        // fails the test, so the message describes the null expectation.
        Assert.assertNull(
                "Expected null statistics due to overly long record.",
                format.getStatistics(null));
    } catch (Exception e) {
        e.printStackTrace();
        Assert.fail(e.getMessage());
    }
}