/** * Test to make sure that chunking does not change the results. * * @throws ClassNotFoundException * @throws IllegalAccessException * @throws InstantiationException */ @Test public void testWithoutChunking() throws InstantiationException, IllegalAccessException, ClassNotFoundException { // TODO: this test is basically encapsulated in testWithChunking... so maybe get rid of this? Chunker.shouldChunkTrainDocs(false); ProblemSet ps = new ProblemSet( Paths.get(JSANConstants.JSAN_PROBLEMSETS_PREFIX, "drexel_1_small.xml").toString()); Document d = ps.trainDocAt("a", "a_01.txt"); Assert.assertNotNull("No document a_01.txt found.", d); ps.removeTrainDocAt("a", d); ps.addTestDoc("a", d); Path path = Paths.get(JSANConstants.JSAN_FEATURESETS_PREFIX, "writeprints_feature_set_limited.xml"); FullAPI test = new FullAPI.Builder() .cfdPath(path.toString()) .ps(ps) .setAnalyzer( new WekaAnalyzer(Class.forName("weka.classifiers.functions.SMO").newInstance())) .numThreads(4) .analysisType(FullAPI.analysisType.TRAIN_TEST_UNKNOWN) .build(); test.prepareInstances(); test.run(); String results1 = test.getStatString(); System.out.println(results1); test = new FullAPI.Builder() .cfdPath(path.toString()) .ps(ps) .setAnalyzer( new WekaAnalyzer(Class.forName("weka.classifiers.functions.SMO").newInstance())) .numThreads(4) .analysisType(FullAPI.analysisType.TRAIN_TEST_UNKNOWN) .build(); test.prepareInstances(); test.run(); String results2 = test.getStatString(); System.out.println(results2); Assert.assertEquals("Cached results different from non-cached results", results1, results2); }
/** * Test to make sure that chunking does not change the results. * * @throws ClassNotFoundException * @throws IllegalAccessException * @throws InstantiationException */ @Test public void testNoCache() throws InstantiationException, IllegalAccessException, ClassNotFoundException { ProblemSet ps = new ProblemSet( Paths.get(JSANConstants.JSAN_PROBLEMSETS_PREFIX, "drexel_1_small.xml").toString()); Document d = ps.trainDocAt("a", "a_01.txt"); Assert.assertNotNull("No document a_01.txt found.", d); ps.removeTrainDocAt("a", d); ps.addTestDoc("a", d); Path path = Paths.get(JSANConstants.JSAN_FEATURESETS_PREFIX, "writeprints_feature_set_limited.xml"); FullAPI test = new FullAPI.Builder() .cfdPath(path.toString()) .ps(ps) .setAnalyzer( new WekaAnalyzer(Class.forName("weka.classifiers.functions.SMO").newInstance())) .numThreads(4) .analysisType(FullAPI.analysisType.TRAIN_TEST_UNKNOWN) .useCache(false) .build(); long bef1 = System.currentTimeMillis(); test.prepareInstances(); test.run(); long aft1 = System.currentTimeMillis(); String results1 = test.getStatString(); System.out.println(results1); test = new FullAPI.Builder() .cfdPath(path.toString()) .ps(ps) .setAnalyzer( new WekaAnalyzer(Class.forName("weka.classifiers.functions.SMO").newInstance())) .numThreads(4) .analysisType(FullAPI.analysisType.TRAIN_TEST_UNKNOWN) .useCache(false) .build(); long bef2 = System.currentTimeMillis(); test.prepareInstances(); test.run(); long aft2 = System.currentTimeMillis(); String results2 = test.getStatString(); System.out.println(results2); long time1 = aft1 - bef1; long time2 = aft2 - bef2; // This assertion may be too lenient. Just trying to make sure that the time it takes the // second time is close to the time it takes the first time (in other words, so we know for sure // it did not use any extracted features) double percentDiff = (double) (Math.abs(time1 - time2)) / time2; System.out.println("Percent difference between two runs: " + percentDiff); File cache = Paths.get(JSANConstants.JSAN_CACHE).toFile(); File[] contents = cache.listFiles(); Assert.assertNotNull("No files in cache. Chunking directory should be there.", contents); boolean foundChunkingDirectory = false; for (File f : contents) { if (f.isDirectory() && f.getName().equals(CacheTests.CHUNKED_DIR_NAME)) foundChunkingDirectory = true; else { Assert.fail("The caching system made directories/files when caching was turned off."); } } Assert.assertTrue( "Chunking was turned on, but no chunking directory was found.", foundChunkingDirectory); }
/** * Test to make sure that chunking does not change the results of the cache. It's a long test, but * it's very important. * * @throws ClassNotFoundException * @throws IllegalAccessException * @throws InstantiationException */ @Test public void testWithChunking() throws InstantiationException, IllegalAccessException, ClassNotFoundException { // the max difference allowed between chunked / non-chunked. double maxDifferential = 0.05; ProblemSet ps = new ProblemSet( Paths.get(JSANConstants.JSAN_PROBLEMSETS_PREFIX, "drexel_1_small.xml").toString()); Document d = ps.trainDocAt("a", "a_01.txt"); Assert.assertNotNull("No document a_01.txt found.", d); ps.removeTrainDocAt("a", d); ps.addTestDoc("a", d); Path path = Paths.get(JSANConstants.JSAN_FEATURESETS_PREFIX, "writeprints_feature_set_limited.xml"); FullAPI test = new FullAPI.Builder() .cfdPath(path.toString()) .ps(ps) .setAnalyzer( new WekaAnalyzer(Class.forName("weka.classifiers.functions.SMO").newInstance())) .numThreads(4) .analysisType(FullAPI.analysisType.TRAIN_TEST_UNKNOWN) .build(); test.prepareInstances(); test.run(); String results1 = test.getStatString(); System.out.println(results1); test = new FullAPI.Builder() .cfdPath(path.toString()) .ps(ps) .setAnalyzer( new WekaAnalyzer(Class.forName("weka.classifiers.functions.SMO").newInstance())) .numThreads(4) .analysisType(FullAPI.analysisType.TRAIN_TEST_UNKNOWN) .build(); test.prepareInstances(); test.run(); String results2 = test.getStatString(); System.out.println(results2); Assert.assertEquals("Cached results different from non-cached results", results1, results2); // make it rechunk, then test to make sure the results are the same. deleteRecursive(Paths.get(JSANConstants.JSAN_CHUNK_DIR).toFile(), true); test = new FullAPI.Builder() .cfdPath(path.toString()) .ps(ps) .setAnalyzer( new WekaAnalyzer(Class.forName("weka.classifiers.functions.SMO").newInstance())) .numThreads(4) .analysisType(FullAPI.analysisType.TRAIN_TEST_UNKNOWN) .build(); test.prepareInstances(); test.run(); String results3 = test.getStatString(); System.out.println(results3); Assert.assertEquals("Cached results different from non-cached results", results2, results3); // now, keep the chunks, but delete the cache, and try again. File cacheDir = Paths.get(JSANConstants.JSAN_CACHE).toFile(); for (File f : cacheDir.listFiles()) { if (f.getName() != "chunked") { deleteRecursive(f, true); } } test = new FullAPI.Builder() .cfdPath(path.toString()) .ps(ps) .setAnalyzer( new WekaAnalyzer(Class.forName("weka.classifiers.functions.SMO").newInstance())) .numThreads(4) .analysisType(FullAPI.analysisType.TRAIN_TEST_UNKNOWN) .build(); test.prepareInstances(); test.run(); String results4 = test.getStatString(); System.out.println(results4); Assert.assertEquals("Cached results different from non-cached results", results3, results4); // ---------------------------------------------------------------------------- // Turn off chunking and make sure the results are the same // There is good reason to test to make sure the results for this test are // (close to) the same, regardless of whether chunking is on or not. // ---------------------------------------------------------------------------- deleteRecursive(Paths.get(JSANConstants.JSAN_CACHE).toFile(), true); Chunker.shouldChunkTrainDocs(false); ps = new ProblemSet( Paths.get(JSANConstants.JSAN_PROBLEMSETS_PREFIX, "drexel_1_small.xml").toString()); d = ps.trainDocAt("a", "a_01.txt"); Assert.assertNotNull("No document a_01.txt found.", d); ps.removeTrainDocAt("a", d); ps.addTestDoc("a", d); path = Paths.get(JSANConstants.JSAN_FEATURESETS_PREFIX, "writeprints_feature_set_limited.xml"); test = new FullAPI.Builder() .cfdPath(path.toString()) .ps(ps) .setAnalyzer( new WekaAnalyzer(Class.forName("weka.classifiers.functions.SMO").newInstance())) .numThreads(4) .analysisType(FullAPI.analysisType.TRAIN_TEST_UNKNOWN) .build(); test.prepareInstances(); test.run(); results4 = results4.trim(); String res4 = results4.substring(results4.lastIndexOf("\n") + 1); String[] allRes4 = Pattern.compile("|", Pattern.LITERAL).split(res4); String results5 = test.getStatString(); System.out.println(results5); String res5 = results5.trim().substring(results5.trim().lastIndexOf("\n") + 1); String[] allRes5 = Pattern.compile("|", Pattern.LITERAL).split(res5); for (int i = 1; i < allRes4.length; i++) { double num = Double.parseDouble(allRes4[i].trim().replace(" +", "")); double num2 = Double.parseDouble(allRes5[i].trim().replace(" +", "")); Assert.assertTrue( "There was a large difference between results for chunked / non-chunked." + " \nChunked: " + num + "; Non-chunked: " + num2, (Math.abs((num - num2)) / num2 <= maxDifferential)); } // Changed test to the one above.. since there WILL be differences between chunked/non-chunked, // we should just make sure that the difference isn't huge. // Assert.assertEquals("Chunked results different from non-chunked results", results4, // results5); // now do it again with the cache test = new FullAPI.Builder() .cfdPath(path.toString()) .ps(ps) .setAnalyzer( new WekaAnalyzer(Class.forName("weka.classifiers.functions.SMO").newInstance())) .numThreads(4) .analysisType(FullAPI.analysisType.TRAIN_TEST_UNKNOWN) .build(); test.prepareInstances(); test.run(); String results6 = test.getStatString(); System.out.println(results6); Assert.assertEquals("Cached results different from non-cached results", results5, results6); // make it rechunk, then test to make sure the results are the same. deleteRecursive(Paths.get(JSANConstants.JSAN_CHUNK_DIR).toFile(), true); test = new FullAPI.Builder() .cfdPath(path.toString()) .ps(ps) .setAnalyzer( new WekaAnalyzer(Class.forName("weka.classifiers.functions.SMO").newInstance())) .numThreads(4) .analysisType(FullAPI.analysisType.TRAIN_TEST_UNKNOWN) .build(); test.prepareInstances(); test.run(); String results7 = test.getStatString(); System.out.println(results7); Assert.assertEquals("Cached results different from non-cached results", results6, results7); // now, keep the chunks, but delete the cache, and try again. for (File f : cacheDir.listFiles()) { if (f.getName() != "chunked") { deleteRecursive(f, true); } } test = new FullAPI.Builder() .cfdPath(path.toString()) .ps(ps) .setAnalyzer( new WekaAnalyzer(Class.forName("weka.classifiers.functions.SMO").newInstance())) .numThreads(4) .analysisType(FullAPI.analysisType.TRAIN_TEST_UNKNOWN) .build(); test.prepareInstances(); test.run(); String results8 = test.getStatString(); System.out.println(results8); Assert.assertEquals("Cached results different from non-cached results", results7, results8); }