/** * Test to make sure that chunking does not change the results. * * @throws ClassNotFoundException * @throws IllegalAccessException * @throws InstantiationException */ @Test public void testWithoutChunking() throws InstantiationException, IllegalAccessException, ClassNotFoundException { // TODO: this test is basically encapsulated in testWithChunking... so maybe get rid of this? Chunker.shouldChunkTrainDocs(false); ProblemSet ps = new ProblemSet( Paths.get(JSANConstants.JSAN_PROBLEMSETS_PREFIX, "drexel_1_small.xml").toString()); Document d = ps.trainDocAt("a", "a_01.txt"); Assert.assertNotNull("No document a_01.txt found.", d); ps.removeTrainDocAt("a", d); ps.addTestDoc("a", d); Path path = Paths.get(JSANConstants.JSAN_FEATURESETS_PREFIX, "writeprints_feature_set_limited.xml"); FullAPI test = new FullAPI.Builder() .cfdPath(path.toString()) .ps(ps) .setAnalyzer( new WekaAnalyzer(Class.forName("weka.classifiers.functions.SMO").newInstance())) .numThreads(4) .analysisType(FullAPI.analysisType.TRAIN_TEST_UNKNOWN) .build(); test.prepareInstances(); test.run(); String results1 = test.getStatString(); System.out.println(results1); test = new FullAPI.Builder() .cfdPath(path.toString()) .ps(ps) .setAnalyzer( new WekaAnalyzer(Class.forName("weka.classifiers.functions.SMO").newInstance())) .numThreads(4) .analysisType(FullAPI.analysisType.TRAIN_TEST_UNKNOWN) .build(); test.prepareInstances(); test.run(); String results2 = test.getStatString(); System.out.println(results2); Assert.assertEquals("Cached results different from non-cached results", results1, results2); }
/**
 * Per-test preparation: turns training-document chunking back on and wipes the cache directory
 * and the JUnit {@code temp} resource directory via {@code deleteRecursive}.
 */
@Before
public void setUp() {
  Chunker.shouldChunkTrainDocs(true);

  File cacheDir = Paths.get(JSANConstants.JSAN_CACHE).toFile();
  deleteRecursive(cacheDir, true);

  // NOTE(review): the boolean flag differs between the two calls; its meaning is defined by
  // deleteRecursive, which is declared elsewhere.
  File tempDir = Paths.get(JSANConstants.JUNIT_RESOURCE_PACKAGE, "temp").toFile();
  deleteRecursive(tempDir, false);
}
/**
 * Class-level cleanup run once after all tests: re-enables training-document chunking and removes
 * the cache directory and the JUnit {@code temp} resource directory via {@code deleteRecursive}.
 */
@AfterClass
public static void testTearDown() {
  Chunker.shouldChunkTrainDocs(true);

  Path cachePath = Paths.get(JSANConstants.JSAN_CACHE);
  Path tempPath = Paths.get(JSANConstants.JUNIT_RESOURCE_PACKAGE, "temp");

  // NOTE(review): the boolean flag differs between the two calls; its meaning is defined by
  // deleteRecursive, which is declared elsewhere.
  deleteRecursive(cachePath.toFile(), true);
  deleteRecursive(tempPath.toFile(), false);
}
/** * Test to make sure that chunking does not change the results of the cache. It's a long test, but * it's very important. * * @throws ClassNotFoundException * @throws IllegalAccessException * @throws InstantiationException */ @Test public void testWithChunking() throws InstantiationException, IllegalAccessException, ClassNotFoundException { // the max difference allowed between chunked / non-chunked. double maxDifferential = 0.05; ProblemSet ps = new ProblemSet( Paths.get(JSANConstants.JSAN_PROBLEMSETS_PREFIX, "drexel_1_small.xml").toString()); Document d = ps.trainDocAt("a", "a_01.txt"); Assert.assertNotNull("No document a_01.txt found.", d); ps.removeTrainDocAt("a", d); ps.addTestDoc("a", d); Path path = Paths.get(JSANConstants.JSAN_FEATURESETS_PREFIX, "writeprints_feature_set_limited.xml"); FullAPI test = new FullAPI.Builder() .cfdPath(path.toString()) .ps(ps) .setAnalyzer( new WekaAnalyzer(Class.forName("weka.classifiers.functions.SMO").newInstance())) .numThreads(4) .analysisType(FullAPI.analysisType.TRAIN_TEST_UNKNOWN) .build(); test.prepareInstances(); test.run(); String results1 = test.getStatString(); System.out.println(results1); test = new FullAPI.Builder() .cfdPath(path.toString()) .ps(ps) .setAnalyzer( new WekaAnalyzer(Class.forName("weka.classifiers.functions.SMO").newInstance())) .numThreads(4) .analysisType(FullAPI.analysisType.TRAIN_TEST_UNKNOWN) .build(); test.prepareInstances(); test.run(); String results2 = test.getStatString(); System.out.println(results2); Assert.assertEquals("Cached results different from non-cached results", results1, results2); // make it rechunk, then test to make sure the results are the same. 
deleteRecursive(Paths.get(JSANConstants.JSAN_CHUNK_DIR).toFile(), true); test = new FullAPI.Builder() .cfdPath(path.toString()) .ps(ps) .setAnalyzer( new WekaAnalyzer(Class.forName("weka.classifiers.functions.SMO").newInstance())) .numThreads(4) .analysisType(FullAPI.analysisType.TRAIN_TEST_UNKNOWN) .build(); test.prepareInstances(); test.run(); String results3 = test.getStatString(); System.out.println(results3); Assert.assertEquals("Cached results different from non-cached results", results2, results3); // now, keep the chunks, but delete the cache, and try again. File cacheDir = Paths.get(JSANConstants.JSAN_CACHE).toFile(); for (File f : cacheDir.listFiles()) { if (f.getName() != "chunked") { deleteRecursive(f, true); } } test = new FullAPI.Builder() .cfdPath(path.toString()) .ps(ps) .setAnalyzer( new WekaAnalyzer(Class.forName("weka.classifiers.functions.SMO").newInstance())) .numThreads(4) .analysisType(FullAPI.analysisType.TRAIN_TEST_UNKNOWN) .build(); test.prepareInstances(); test.run(); String results4 = test.getStatString(); System.out.println(results4); Assert.assertEquals("Cached results different from non-cached results", results3, results4); // ---------------------------------------------------------------------------- // Turn off chunking and make sure the results are the same // There is good reason to test to make sure the results for this test are // (close to) the same, regardless of whether chunking is on or not. 
// ---------------------------------------------------------------------------- deleteRecursive(Paths.get(JSANConstants.JSAN_CACHE).toFile(), true); Chunker.shouldChunkTrainDocs(false); ps = new ProblemSet( Paths.get(JSANConstants.JSAN_PROBLEMSETS_PREFIX, "drexel_1_small.xml").toString()); d = ps.trainDocAt("a", "a_01.txt"); Assert.assertNotNull("No document a_01.txt found.", d); ps.removeTrainDocAt("a", d); ps.addTestDoc("a", d); path = Paths.get(JSANConstants.JSAN_FEATURESETS_PREFIX, "writeprints_feature_set_limited.xml"); test = new FullAPI.Builder() .cfdPath(path.toString()) .ps(ps) .setAnalyzer( new WekaAnalyzer(Class.forName("weka.classifiers.functions.SMO").newInstance())) .numThreads(4) .analysisType(FullAPI.analysisType.TRAIN_TEST_UNKNOWN) .build(); test.prepareInstances(); test.run(); results4 = results4.trim(); String res4 = results4.substring(results4.lastIndexOf("\n") + 1); String[] allRes4 = Pattern.compile("|", Pattern.LITERAL).split(res4); String results5 = test.getStatString(); System.out.println(results5); String res5 = results5.trim().substring(results5.trim().lastIndexOf("\n") + 1); String[] allRes5 = Pattern.compile("|", Pattern.LITERAL).split(res5); for (int i = 1; i < allRes4.length; i++) { double num = Double.parseDouble(allRes4[i].trim().replace(" +", "")); double num2 = Double.parseDouble(allRes5[i].trim().replace(" +", "")); Assert.assertTrue( "There was a large difference between results for chunked / non-chunked." + " \nChunked: " + num + "; Non-chunked: " + num2, (Math.abs((num - num2)) / num2 <= maxDifferential)); } // Changed test to the one above.. since there WILL be differences between chunked/non-chunked, // we should just make sure that the difference isn't huge. 
// Assert.assertEquals("Chunked results different from non-chunked results", results4, // results5); // now do it again with the cache test = new FullAPI.Builder() .cfdPath(path.toString()) .ps(ps) .setAnalyzer( new WekaAnalyzer(Class.forName("weka.classifiers.functions.SMO").newInstance())) .numThreads(4) .analysisType(FullAPI.analysisType.TRAIN_TEST_UNKNOWN) .build(); test.prepareInstances(); test.run(); String results6 = test.getStatString(); System.out.println(results6); Assert.assertEquals("Cached results different from non-cached results", results5, results6); // make it rechunk, then test to make sure the results are the same. deleteRecursive(Paths.get(JSANConstants.JSAN_CHUNK_DIR).toFile(), true); test = new FullAPI.Builder() .cfdPath(path.toString()) .ps(ps) .setAnalyzer( new WekaAnalyzer(Class.forName("weka.classifiers.functions.SMO").newInstance())) .numThreads(4) .analysisType(FullAPI.analysisType.TRAIN_TEST_UNKNOWN) .build(); test.prepareInstances(); test.run(); String results7 = test.getStatString(); System.out.println(results7); Assert.assertEquals("Cached results different from non-cached results", results6, results7); // now, keep the chunks, but delete the cache, and try again. for (File f : cacheDir.listFiles()) { if (f.getName() != "chunked") { deleteRecursive(f, true); } } test = new FullAPI.Builder() .cfdPath(path.toString()) .ps(ps) .setAnalyzer( new WekaAnalyzer(Class.forName("weka.classifiers.functions.SMO").newInstance())) .numThreads(4) .analysisType(FullAPI.analysisType.TRAIN_TEST_UNKNOWN) .build(); test.prepareInstances(); test.run(); String results8 = test.getStatString(); System.out.println(results8); Assert.assertEquals("Cached results different from non-cached results", results7, results8); }