/**
 * Builds a {@code DomainURLFilter} from the {@code hosts.txt} sample file and
 * runs it over a fixed, ordered list of URLs, checking for each one whether
 * {@code filter} returns a non-null or a null value.
 *
 * @throws Exception if any call made by the test throws
 */
public void testFilter() throws Exception {

    Configuration settings = NutchConfiguration.create();
    DomainURLFilter urlFilter =
        new DomainURLFilter(SAMPLES + SEPARATOR + "hosts.txt");
    urlFilter.setConf(settings);

    // URLs are checked strictly in this order; expectNonNull[i] belongs to
    // candidates[i].
    String[] candidates = {
        "http://lucene.apache.org",
        "http://hadoop.apache.org",
        "http://www.apache.org",
        "http://www.google.com",
        "http://mail.yahoo.com",
        "http://www.foobar.net",
        "http://www.foobas.net",
        "http://www.yahoo.com",
        "http://www.foobar.be",
        "http://www.adobe.com"
    };
    // true  -> filter(...) must return something non-null
    // false -> filter(...) must return null
    boolean[] expectNonNull = {
        true, true, true, false, false, true, true, true, true, false
    };

    for (int i = 0; i < candidates.length; i++) {
        if (expectNonNull[i]) {
            assertNotNull(urlFilter.filter(candidates[i]));
        } else {
            assertNull(urlFilter.filter(candidates[i]));
        }
    }
  }
Example #2
0
 /**
  * Builds a {@code DomainURLFilter} from a path whose file name is
  * {@code this-file-does-not-exist.txt} and checks that {@code filter}
  * returns a non-null value for every URL in a fixed, ordered list.
  *
  * @throws Exception if any call made by the test throws
  */
 @Test
 public void testNoFilter() throws Exception {
   // https://issues.apache.org/jira/browse/NUTCH-2189
   Configuration settings = NutchConfiguration.create();
   DomainURLFilter urlFilter = new DomainURLFilter(
       SAMPLES + SEPARATOR + "this-file-does-not-exist.txt");
   urlFilter.setConf(settings);

   // Checked in exactly this order; each one must come back non-null.
   String[] candidates = {
       "http://lucene.apache.org",
       "http://hadoop.apache.org",
       "http://www.apache.org",
       "http://www.google.com",
       "http://mail.yahoo.com",
       "http://www.foobar.net",
       "http://www.foobas.net",
       "http://www.yahoo.com",
       "http://www.foobar.be",
       "http://www.adobe.com"
   };
   for (String candidate : candidates) {
     Assert.assertNotNull(urlFilter.filter(candidate));
   }
 }