Exemplo n.º 1
0
  /**
   * Runs the same train/test analysis twice with training-document chunking switched off and
   * checks that both runs report an identical statistics string.
   *
   * @throws ClassNotFoundException if the SMO classifier class cannot be loaded by name
   * @throws IllegalAccessException if the classifier's no-argument constructor is inaccessible
   * @throws InstantiationException if the classifier cannot be instantiated reflectively
   */
  @Test
  public void testWithoutChunking()
      throws InstantiationException, IllegalAccessException, ClassNotFoundException {
    // TODO: testWithChunking appears to cover this scenario too; consider dropping this test.
    // NOTE(review): the chunking flag is changed here and never reset within this method.
    Chunker.shouldChunkTrainDocs(false);

    String problemSetFile =
        Paths.get(JSANConstants.JSAN_PROBLEMSETS_PREFIX, "drexel_1_small.xml").toString();
    ProblemSet problemSet = new ProblemSet(problemSetFile);

    // Move a_01.txt out of author "a"'s training documents and use it as a test document.
    Document heldOut = problemSet.trainDocAt("a", "a_01.txt");
    Assert.assertNotNull("No document a_01.txt found.", heldOut);
    problemSet.removeTrainDocAt("a", heldOut);
    problemSet.addTestDoc("a", heldOut);

    Path featureSet =
        Paths.get(JSANConstants.JSAN_FEATURESETS_PREFIX, "writeprints_feature_set_limited.xml");

    // Two back-to-back runs with exactly the same configuration.
    String[] stats = new String[2];
    for (int pass = 0; pass < stats.length; pass++) {
      FullAPI api =
          new FullAPI.Builder()
              .cfdPath(featureSet.toString())
              .ps(problemSet)
              .setAnalyzer(
                  new WekaAnalyzer(Class.forName("weka.classifiers.functions.SMO").newInstance()))
              .numThreads(4)
              .analysisType(FullAPI.analysisType.TRAIN_TEST_UNKNOWN)
              .build();
      api.prepareInstances();
      api.run();
      stats[pass] = api.getStatString();
      System.out.println(stats[pass]);
    }

    Assert.assertEquals("Cached results different from non-cached results", stats[0], stats[1]);
  }
Exemplo n.º 2
0
  /**
   * Runs the same analysis twice with {@code useCache(false)} and then checks that the cache
   * directory contains nothing except the chunking directory.
   *
   * <p>The wall-clock time of both runs is measured and their relative difference is printed, but
   * it is not asserted on.
   *
   * @throws ClassNotFoundException if the SMO classifier class cannot be loaded by name
   * @throws IllegalAccessException if the classifier's no-argument constructor is inaccessible
   * @throws InstantiationException if the classifier cannot be instantiated reflectively
   */
  @Test
  public void testNoCache()
      throws InstantiationException, IllegalAccessException, ClassNotFoundException {
    String problemSetFile =
        Paths.get(JSANConstants.JSAN_PROBLEMSETS_PREFIX, "drexel_1_small.xml").toString();
    ProblemSet problemSet = new ProblemSet(problemSetFile);

    // Move a_01.txt out of author "a"'s training documents and use it as a test document.
    Document heldOut = problemSet.trainDocAt("a", "a_01.txt");
    Assert.assertNotNull("No document a_01.txt found.", heldOut);
    problemSet.removeTrainDocAt("a", heldOut);
    problemSet.addTestDoc("a", heldOut);

    Path featureSet =
        Paths.get(JSANConstants.JSAN_FEATURESETS_PREFIX, "writeprints_feature_set_limited.xml");

    // Two identically configured, cache-disabled runs; only prepare + run is timed.
    long[] elapsedMillis = new long[2];
    for (int pass = 0; pass < elapsedMillis.length; pass++) {
      FullAPI api =
          new FullAPI.Builder()
              .cfdPath(featureSet.toString())
              .ps(problemSet)
              .setAnalyzer(
                  new WekaAnalyzer(Class.forName("weka.classifiers.functions.SMO").newInstance()))
              .numThreads(4)
              .analysisType(FullAPI.analysisType.TRAIN_TEST_UNKNOWN)
              .useCache(false)
              .build();
      long startedAt = System.currentTimeMillis();
      api.prepareInstances();
      api.run();
      long finishedAt = System.currentTimeMillis();
      System.out.println(api.getStatString());
      elapsedMillis[pass] = finishedAt - startedAt;
    }

    // Informational only: if no extracted features were reused, both runs should take a similar
    // amount of time. The value is printed, never asserted.
    double percentDiff =
        (double) Math.abs(elapsedMillis[0] - elapsedMillis[1]) / elapsedMillis[1];
    System.out.println("Percent difference between two runs: " + percentDiff);

    File[] cacheEntries = Paths.get(JSANConstants.JSAN_CACHE).toFile().listFiles();
    Assert.assertNotNull("No files in cache. Chunking directory should be there.", cacheEntries);

    // Every entry in the cache directory must be the chunking directory; anything else fails.
    boolean sawChunkingDirectory = false;
    for (File entry : cacheEntries) {
      boolean isChunkingDirectory =
          entry.isDirectory() && entry.getName().equals(CacheTests.CHUNKED_DIR_NAME);
      if (!isChunkingDirectory) {
        Assert.fail("The caching system made directories/files when caching was turned off.");
      }
      sawChunkingDirectory = true;
    }
    Assert.assertTrue(
        "Chunking was turned on, but no chunking directory was found.", sawChunkingDirectory);
  }
Exemplo n.º 3
0
  /**
   * Test to make sure that chunking does not change the results of the cache. It's a long test, but
   * it's very important.
   *
   * <p>The test runs the same analysis in two phases, first with training-document chunking
   * enabled and then with it disabled. Within each phase the statistics string must be identical
   * across: two consecutive runs, a run after the chunk directory was deleted, and a run after
   * every cache entry except the chunk directory was deleted. Between the two phases the numeric
   * values on the last line of the statistics string may differ by at most 5% (relative).
   *
   * @throws ClassNotFoundException if the SMO classifier class cannot be loaded by name
   * @throws IllegalAccessException if the classifier's no-argument constructor is inaccessible
   * @throws InstantiationException if the classifier cannot be instantiated reflectively
   */
  @Test
  public void testWithChunking()
      throws InstantiationException, IllegalAccessException, ClassNotFoundException {
    // the max relative difference allowed between chunked / non-chunked.
    double maxDifferential = 0.05;

    // Enable chunking explicitly instead of relying on whatever an earlier test left behind;
    // JUnit does not guarantee the order in which test methods run.
    Chunker.shouldChunkTrainDocs(true);

    Path path =
        Paths.get(JSANConstants.JSAN_FEATURESETS_PREFIX, "writeprints_feature_set_limited.xml");
    File cacheDir = Paths.get(JSANConstants.JSAN_CACHE).toFile();

    // ----------------------------------------------------------------------------
    // Phase 1: chunking enabled
    // ----------------------------------------------------------------------------
    ProblemSet ps = loadChunkingTestProblemSet();

    String results1 = runChunkingTestAnalysis(path, ps);
    String results2 = runChunkingTestAnalysis(path, ps);
    Assert.assertEquals("Cached results different from non-cached results", results1, results2);

    // make it rechunk, then test to make sure the results are the same.
    deleteRecursive(Paths.get(JSANConstants.JSAN_CHUNK_DIR).toFile(), true);
    String results3 = runChunkingTestAnalysis(path, ps);
    Assert.assertEquals("Cached results different from non-cached results", results2, results3);

    // now, keep the chunks, but delete the cache, and try again.
    deleteCacheEntriesExceptChunks(cacheDir);
    String results4 = runChunkingTestAnalysis(path, ps);
    Assert.assertEquals("Cached results different from non-cached results", results3, results4);

    // ----------------------------------------------------------------------------
    // Phase 2: turn off chunking and make sure the results are (close to) the same.
    // There WILL be differences between chunked / non-chunked, so the comparison only
    // checks that the difference isn't huge.
    // NOTE(review): chunking stays disabled after this test returns, as in the original.
    // ----------------------------------------------------------------------------
    deleteRecursive(Paths.get(JSANConstants.JSAN_CACHE).toFile(), true);
    Chunker.shouldChunkTrainDocs(false);
    ps = loadChunkingTestProblemSet();

    String results5 = runChunkingTestAnalysis(path, ps);
    assertLastStatRowsClose(results4, results5, maxDifferential);

    // now do it again with the cache
    String results6 = runChunkingTestAnalysis(path, ps);
    Assert.assertEquals("Cached results different from non-cached results", results5, results6);

    // make it rechunk, then test to make sure the results are the same.
    deleteRecursive(Paths.get(JSANConstants.JSAN_CHUNK_DIR).toFile(), true);
    String results7 = runChunkingTestAnalysis(path, ps);
    Assert.assertEquals("Cached results different from non-cached results", results6, results7);

    // now, keep the chunks, but delete the cache, and try again.
    deleteCacheEntriesExceptChunks(cacheDir);
    String results8 = runChunkingTestAnalysis(path, ps);
    Assert.assertEquals("Cached results different from non-cached results", results7, results8);
  }

  /** Loads drexel_1_small.xml and moves a_01.txt of author "a" from the train to the test set. */
  private ProblemSet loadChunkingTestProblemSet() {
    ProblemSet ps =
        new ProblemSet(
            Paths.get(JSANConstants.JSAN_PROBLEMSETS_PREFIX, "drexel_1_small.xml").toString());
    Document d = ps.trainDocAt("a", "a_01.txt");
    Assert.assertNotNull("No document a_01.txt found.", d);
    ps.removeTrainDocAt("a", d);
    ps.addTestDoc("a", d);
    return ps;
  }

  /** Builds a fresh SMO-backed FullAPI, runs it, prints and returns its statistics string. */
  private String runChunkingTestAnalysis(Path cfdPath, ProblemSet ps)
      throws InstantiationException, IllegalAccessException, ClassNotFoundException {
    FullAPI api =
        new FullAPI.Builder()
            .cfdPath(cfdPath.toString())
            .ps(ps)
            .setAnalyzer(
                new WekaAnalyzer(Class.forName("weka.classifiers.functions.SMO").newInstance()))
            .numThreads(4)
            .analysisType(FullAPI.analysisType.TRAIN_TEST_UNKNOWN)
            .build();
    api.prepareInstances();
    api.run();
    String stats = api.getStatString();
    System.out.println(stats);
    return stats;
  }

  /** Deletes every entry directly inside the cache directory except the one named "chunked". */
  private void deleteCacheEntriesExceptChunks(File cacheDir) {
    File[] entries = cacheDir.listFiles();
    if (entries == null) {
      // listFiles() returns null when the directory does not exist or cannot be read;
      // in that case there is nothing to delete.
      return;
    }
    for (File entry : entries) {
      // Compare by value: the original used != on Strings, which compares references and
      // therefore also deleted the chunk directory.
      if (!"chunked".equals(entry.getName())) {
        deleteRecursive(entry, true);
      }
    }
  }

  /**
   * Asserts that the numeric cells on the last line of both statistics strings differ by at most
   * {@code maxDifferential}, relative to the non-chunked value.
   */
  private void assertLastStatRowsClose(
      String chunkedStats, String unchunkedStats, double maxDifferential) {
    Pattern cellSeparator = Pattern.compile("|", Pattern.LITERAL);

    String chunkedTrimmed = chunkedStats.trim();
    String[] chunkedCells =
        cellSeparator.split(chunkedTrimmed.substring(chunkedTrimmed.lastIndexOf("\n") + 1));
    String unchunkedTrimmed = unchunkedStats.trim();
    String[] unchunkedCells =
        cellSeparator.split(unchunkedTrimmed.substring(unchunkedTrimmed.lastIndexOf("\n") + 1));

    Assert.assertEquals(
        "Chunked and non-chunked result rows have a different number of columns.",
        chunkedCells.length,
        unchunkedCells.length);

    // Cell 0 is skipped (presumably the row label - confirm against getStatString's format).
    for (int i = 1; i < chunkedCells.length; i++) {
      // String.replace is literal (not a regex): it strips the exact two-character text " +".
      double num = Double.parseDouble(chunkedCells[i].trim().replace(" +", ""));
      double num2 = Double.parseDouble(unchunkedCells[i].trim().replace(" +", ""));
      double difference = Math.abs(num - num2);
      // Equal values always pass (avoids 0/0 = NaN); otherwise use the magnitude of the
      // reference value so that a negative reference cannot make the check pass vacuously.
      boolean closeEnough = difference == 0.0 || difference / Math.abs(num2) <= maxDifferential;
      Assert.assertTrue(
          "There was a large difference between results for chunked / non-chunked."
              + " \nChunked: "
              + num
              + "; Non-chunked: "
              + num2,
          closeEnough);
    }
  }