Java Chunker Exemples, edu.drexel.psal.jstylo.featureProcessing.Chunker Java Exemples

Exemple #1

0

Afficher le fichier

Fichier : CacheTests.java Projet : tdutko/jstylo

  /**
   * Test to make sure that chunking does not change the results.
   *
   * @throws ClassNotFoundException
   * @throws IllegalAccessException
   * @throws InstantiationException
   */
  @Test
  public void testWithoutChunking()
      throws InstantiationException, IllegalAccessException, ClassNotFoundException {
    // TODO: this test is basically encapsulated in testWithChunking... so maybe get rid of this?
    Chunker.shouldChunkTrainDocs(false);
    ProblemSet ps =
        new ProblemSet(
            Paths.get(JSANConstants.JSAN_PROBLEMSETS_PREFIX, "drexel_1_small.xml").toString());
    Document d = ps.trainDocAt("a", "a_01.txt");
    Assert.assertNotNull("No document a_01.txt found.", d);
    ps.removeTrainDocAt("a", d);
    ps.addTestDoc("a", d);
    Path path =
        Paths.get(JSANConstants.JSAN_FEATURESETS_PREFIX, "writeprints_feature_set_limited.xml");
    FullAPI test =
        new FullAPI.Builder()
            .cfdPath(path.toString())
            .ps(ps)
            .setAnalyzer(
                new WekaAnalyzer(Class.forName("weka.classifiers.functions.SMO").newInstance()))
            .numThreads(4)
            .analysisType(FullAPI.analysisType.TRAIN_TEST_UNKNOWN)
            .build();
    test.prepareInstances();
    test.run();
    String results1 = test.getStatString();
    System.out.println(results1);

    test =
        new FullAPI.Builder()
            .cfdPath(path.toString())
            .ps(ps)
            .setAnalyzer(
                new WekaAnalyzer(Class.forName("weka.classifiers.functions.SMO").newInstance()))
            .numThreads(4)
            .analysisType(FullAPI.analysisType.TRAIN_TEST_UNKNOWN)
            .build();
    test.prepareInstances();
    test.run();
    String results2 = test.getStatString();
    System.out.println(results2);

    Assert.assertEquals("Cached results different from non-cached results", results1, results2);
  }

Exemple #2

0

Afficher le fichier

Fichier : CacheTests.java Projet : tdutko/jstylo

 /** Before each test, do some preparation. */
 @Before
 public void setUp() {
   Chunker.shouldChunkTrainDocs(true);
   deleteRecursive(Paths.get(JSANConstants.JSAN_CACHE).toFile(), true);
   deleteRecursive(Paths.get(JSANConstants.JUNIT_RESOURCE_PACKAGE, "temp").toFile(), false);
 }

Exemple #3

0

Afficher le fichier

Fichier : CacheTests.java Projet : tdutko/jstylo

 /** Stuff to do after we're all done. */
 @AfterClass
 public static void testTearDown() {
   Chunker.shouldChunkTrainDocs(true);
   deleteRecursive(Paths.get(JSANConstants.JSAN_CACHE).toFile(), true);
   deleteRecursive(Paths.get(JSANConstants.JUNIT_RESOURCE_PACKAGE, "temp").toFile(), false);
 }

Exemple #4

0

Afficher le fichier

Fichier : CacheTests.java Projet : tdutko/jstylo

  /**
   * Test to make sure that chunking does not change the results of the cache. It's a long test, but
   * it's very important.
   *
   * @throws ClassNotFoundException
   * @throws IllegalAccessException
   * @throws InstantiationException
   */
  @Test
  public void testWithChunking()
      throws InstantiationException, IllegalAccessException, ClassNotFoundException {
    // the max difference allowed between chunked / non-chunked.
    double maxDifferential = 0.05;

    ProblemSet ps =
        new ProblemSet(
            Paths.get(JSANConstants.JSAN_PROBLEMSETS_PREFIX, "drexel_1_small.xml").toString());
    Document d = ps.trainDocAt("a", "a_01.txt");
    Assert.assertNotNull("No document a_01.txt found.", d);
    ps.removeTrainDocAt("a", d);
    ps.addTestDoc("a", d);
    Path path =
        Paths.get(JSANConstants.JSAN_FEATURESETS_PREFIX, "writeprints_feature_set_limited.xml");
    FullAPI test =
        new FullAPI.Builder()
            .cfdPath(path.toString())
            .ps(ps)
            .setAnalyzer(
                new WekaAnalyzer(Class.forName("weka.classifiers.functions.SMO").newInstance()))
            .numThreads(4)
            .analysisType(FullAPI.analysisType.TRAIN_TEST_UNKNOWN)
            .build();
    test.prepareInstances();
    test.run();
    String results1 = test.getStatString();
    System.out.println(results1);

    test =
        new FullAPI.Builder()
            .cfdPath(path.toString())
            .ps(ps)
            .setAnalyzer(
                new WekaAnalyzer(Class.forName("weka.classifiers.functions.SMO").newInstance()))
            .numThreads(4)
            .analysisType(FullAPI.analysisType.TRAIN_TEST_UNKNOWN)
            .build();
    test.prepareInstances();
    test.run();
    String results2 = test.getStatString();
    System.out.println(results2);

    Assert.assertEquals("Cached results different from non-cached results", results1, results2);

    // make it rechunk, then test to make sure the results are the same.
    deleteRecursive(Paths.get(JSANConstants.JSAN_CHUNK_DIR).toFile(), true);
    test =
        new FullAPI.Builder()
            .cfdPath(path.toString())
            .ps(ps)
            .setAnalyzer(
                new WekaAnalyzer(Class.forName("weka.classifiers.functions.SMO").newInstance()))
            .numThreads(4)
            .analysisType(FullAPI.analysisType.TRAIN_TEST_UNKNOWN)
            .build();
    test.prepareInstances();
    test.run();
    String results3 = test.getStatString();
    System.out.println(results3);
    Assert.assertEquals("Cached results different from non-cached results", results2, results3);

    // now, keep the chunks, but delete the cache, and try again.
    File cacheDir = Paths.get(JSANConstants.JSAN_CACHE).toFile();
    for (File f : cacheDir.listFiles()) {
      if (f.getName() != "chunked") {
        deleteRecursive(f, true);
      }
    }
    test =
        new FullAPI.Builder()
            .cfdPath(path.toString())
            .ps(ps)
            .setAnalyzer(
                new WekaAnalyzer(Class.forName("weka.classifiers.functions.SMO").newInstance()))
            .numThreads(4)
            .analysisType(FullAPI.analysisType.TRAIN_TEST_UNKNOWN)
            .build();
    test.prepareInstances();
    test.run();
    String results4 = test.getStatString();
    System.out.println(results4);
    Assert.assertEquals("Cached results different from non-cached results", results3, results4);

    // ----------------------------------------------------------------------------
    // Turn off chunking and make sure the results are the same
    // There is good reason to test to make sure the results for this test are
    // (close to) the same, regardless of whether chunking is on or not.
    // ----------------------------------------------------------------------------
    deleteRecursive(Paths.get(JSANConstants.JSAN_CACHE).toFile(), true);
    Chunker.shouldChunkTrainDocs(false);
    ps =
        new ProblemSet(
            Paths.get(JSANConstants.JSAN_PROBLEMSETS_PREFIX, "drexel_1_small.xml").toString());
    d = ps.trainDocAt("a", "a_01.txt");
    Assert.assertNotNull("No document a_01.txt found.", d);
    ps.removeTrainDocAt("a", d);
    ps.addTestDoc("a", d);
    path = Paths.get(JSANConstants.JSAN_FEATURESETS_PREFIX, "writeprints_feature_set_limited.xml");
    test =
        new FullAPI.Builder()
            .cfdPath(path.toString())
            .ps(ps)
            .setAnalyzer(
                new WekaAnalyzer(Class.forName("weka.classifiers.functions.SMO").newInstance()))
            .numThreads(4)
            .analysisType(FullAPI.analysisType.TRAIN_TEST_UNKNOWN)
            .build();
    test.prepareInstances();
    test.run();

    results4 = results4.trim();
    String res4 = results4.substring(results4.lastIndexOf("\n") + 1);
    String[] allRes4 = Pattern.compile("|", Pattern.LITERAL).split(res4);
    String results5 = test.getStatString();
    System.out.println(results5);
    String res5 = results5.trim().substring(results5.trim().lastIndexOf("\n") + 1);
    String[] allRes5 = Pattern.compile("|", Pattern.LITERAL).split(res5);
    for (int i = 1; i < allRes4.length; i++) {
      double num = Double.parseDouble(allRes4[i].trim().replace(" +", ""));
      double num2 = Double.parseDouble(allRes5[i].trim().replace(" +", ""));
      Assert.assertTrue(
          "There was a large difference between results for chunked / non-chunked."
              + " \nChunked: "
              + num
              + "; Non-chunked: "
              + num2,
          (Math.abs((num - num2)) / num2 <= maxDifferential));
    }

    // Changed test to the one above.. since there WILL be differences between chunked/non-chunked,
    // we should just make sure that the difference isn't huge.
    // Assert.assertEquals("Chunked results different from non-chunked results", results4,
    // results5);

    // now do it again with the cache
    test =
        new FullAPI.Builder()
            .cfdPath(path.toString())
            .ps(ps)
            .setAnalyzer(
                new WekaAnalyzer(Class.forName("weka.classifiers.functions.SMO").newInstance()))
            .numThreads(4)
            .analysisType(FullAPI.analysisType.TRAIN_TEST_UNKNOWN)
            .build();
    test.prepareInstances();
    test.run();
    String results6 = test.getStatString();
    System.out.println(results6);

    Assert.assertEquals("Cached results different from non-cached results", results5, results6);

    // make it rechunk, then test to make sure the results are the same.
    deleteRecursive(Paths.get(JSANConstants.JSAN_CHUNK_DIR).toFile(), true);
    test =
        new FullAPI.Builder()
            .cfdPath(path.toString())
            .ps(ps)
            .setAnalyzer(
                new WekaAnalyzer(Class.forName("weka.classifiers.functions.SMO").newInstance()))
            .numThreads(4)
            .analysisType(FullAPI.analysisType.TRAIN_TEST_UNKNOWN)
            .build();
    test.prepareInstances();
    test.run();
    String results7 = test.getStatString();
    System.out.println(results7);
    Assert.assertEquals("Cached results different from non-cached results", results6, results7);

    // now, keep the chunks, but delete the cache, and try again.
    for (File f : cacheDir.listFiles()) {
      if (f.getName() != "chunked") {
        deleteRecursive(f, true);
      }
    }
    test =
        new FullAPI.Builder()
            .cfdPath(path.toString())
            .ps(ps)
            .setAnalyzer(
                new WekaAnalyzer(Class.forName("weka.classifiers.functions.SMO").newInstance()))
            .numThreads(4)
            .analysisType(FullAPI.analysisType.TRAIN_TEST_UNKNOWN)
            .build();
    test.prepareInstances();
    test.run();
    String results8 = test.getStatString();
    System.out.println(results8);
    Assert.assertEquals("Cached results different from non-cached results", results7, results8);
  }