/**
 * Command-line entry point for the bloom filter smoke test.
 *
 * <p>Builds a Hadoop {@code Configuration} from the Nutch, Hadoop and commoncrawl
 * default/site resource files, registers it with {@code CrawlEnvironment}, points the
 * default file system at {@code hdfs://ccn01:9000/}, and then runs
 * {@link #simpleTest(String)} against the path given as the first argument.
 *
 * @param args command-line arguments; {@code args[0]} is the output file path handed to
 *             {@link #simpleTest(String)}
 */
public static void main(String[] args) {
  // Fail fast with a readable message instead of an ArrayIndexOutOfBoundsException
  // after the configuration has already been set up.
  if (args == null || args.length < 1) {
    System.err.println("Usage: <output-file-path>");
    System.exit(1);
    return;
  }

  Configuration conf = new Configuration();

  conf.addResource("nutch-default.xml");
  conf.addResource("nutch-site.xml");
  conf.addResource("hadoop-default.xml");
  conf.addResource("hadoop-site.xml");
  conf.addResource("commoncrawl-default.xml");
  conf.addResource("commoncrawl-site.xml");

  CrawlEnvironment.setHadoopConfig(conf);
  CrawlEnvironment.setDefaultHadoopFSURI("hdfs://ccn01:9000/");

  simpleTest(args[0]);
}
public static void simpleTest(String outputFileName) { URLFPBloomFilter bigBloomFilter = new URLFPBloomFilter(750000000, 10, 11); TreeSet<URLFP> addedSet = new TreeSet<URLFP>(); TreeSet<URLFP> notAddedSet = new TreeSet<URLFP>(); for (int i = 0; i < 100000; ++i) { URLFP fingerprint = URLUtils.getURLFPFromURL("http://foo.bar.com/" + i, false); URLFP notfingerprint = URLUtils.getURLFPFromURL("http://someother.bar.com/" + i, false); addedSet.add(fingerprint); notAddedSet.add(notfingerprint); } System.out.println("Adding " + addedSet.size() + " elements to bloom filter"); long timeStart = System.currentTimeMillis(); for (URLFP testFingerprint : addedSet) { bigBloomFilter.add(testFingerprint); } long timeEnd = System.currentTimeMillis(); System.out.println("Add Took:" + (timeEnd - timeStart) + " MS"); timeStart = System.currentTimeMillis(); for (URLFP testFingerprint : addedSet) { if (!bigBloomFilter.isPresent(testFingerprint)) { Assert.assertFalse(true); } } timeEnd = System.currentTimeMillis(); System.out.println( "Lookup of " + addedSet.size() + " items in set took:" + (timeEnd - timeStart) + " MS"); timeStart = System.currentTimeMillis(); for (URLFP testFingerprint : notAddedSet) { if (bigBloomFilter.isPresent(testFingerprint)) { Assert.assertTrue(addedSet.contains(testFingerprint)); } } timeEnd = System.currentTimeMillis(); System.out.println( "Lookup of " + addedSet.size() + " items not in set took:" + (timeEnd - timeStart) + " MS"); System.out.println("Cloning"); URLFPBloomFilter clone = null; timeStart = System.currentTimeMillis(); try { clone = (URLFPBloomFilter) bigBloomFilter.clone(); } catch (CloneNotSupportedException e1) { e1.printStackTrace(); } timeEnd = System.currentTimeMillis(); System.out.println("Clone took:" + (timeEnd - timeStart) + " MS"); Path outputLocation = new Path(outputFileName); // serialize System.out.println("Serializing to:" + outputLocation); try { timeStart = System.currentTimeMillis(); FileSystem fs = 
CrawlEnvironment.getDefaultFileSystem(); FSDataOutputStream outputStream = fs.create(outputLocation, true, 10240000); clone.serialize(outputStream); outputStream.flush(); outputStream.close(); timeEnd = System.currentTimeMillis(); System.out.println("Seialization took:" + (timeEnd - timeStart) + " MS"); clone = null; bigBloomFilter = null; System.out.println("Reloading"); timeStart = System.currentTimeMillis(); FSDataInputStream inputStream = fs.open(outputLocation); bigBloomFilter = URLFPBloomFilter.load(inputStream); inputStream.close(); timeEnd = System.currentTimeMillis(); System.out.println("Reload took:" + (timeEnd - timeStart) + " MS"); timeStart = System.currentTimeMillis(); for (URLFP testFingerprint : addedSet) { if (!bigBloomFilter.isPresent(testFingerprint)) { Assert.assertFalse(true); } } timeEnd = System.currentTimeMillis(); System.out.println( "Lookup of " + addedSet.size() + " items in set took:" + (timeEnd - timeStart) + " MS"); timeStart = System.currentTimeMillis(); for (URLFP testFingerprint : notAddedSet) { if (bigBloomFilter.isPresent(testFingerprint)) { Assert.assertTrue(addedSet.contains(testFingerprint)); } } timeEnd = System.currentTimeMillis(); System.out.println( "Lookup of " + addedSet.size() + " items not in set took:" + (timeEnd - timeStart) + " MS"); } catch (IOException e) { e.printStackTrace(); } }