public void CrawlRT(String RTPage) throws IOException { ArrayList<String> t = new ArrayList<String>(); String crawlData; String crawlData2; String crawlData3; FileReader freader = new FileReader("Crawl.txt"); BufferedReader br = new BufferedReader(freader); FileReader freader2 = new FileReader("Tocrawl.txt"); BufferedReader br2 = new BufferedReader(freader2); FileWriter fwriter2 = new FileWriter("Tocrawl.txt", true); BufferedWriter bw2 = new BufferedWriter(fwriter2); FileWriter fwriter = new FileWriter("Crawl.txt", true); BufferedWriter bw = new BufferedWriter(fwriter); /*while(null != (crawlData2 = br.readLine())) { if(crawlData2 !=null) Crawl.add(crawlData2); } t = collectLinks(RTPage); Iterator<String> e3= t.iterator(); while(e3.hasNext()) { String ee = e3.next(); if(!Crawl.contains(ee)) { bw2.write(ee+"\r\n"); } } br.close(); br2.close(); bw.close(); bw2.close();*/ if (null == (crawlData = br.readLine())) // if(true) { // initial iteration bw.write(RTPage + "\r\n"); Crawl.add(RTPage); t = collectLinks(RTPage); ToCrawl.addAll(t); } else { // collect data from files and load to array lists while (null != (crawlData2 = br.readLine())) { if (crawlData2 != null) Crawl.add(crawlData2); } while (null != (crawlData3 = br2.readLine())) { if (crawlData3 != null) ToCrawl.add(crawlData3); } } System.out.println("Crawlled"); // Number of movies to be crawled for (int i = 0; i < 1000; i++) { if (ToCrawl.size() > 0) { Crawl.removeAll(Collections.singleton(null)); ToCrawl.removeAll(Collections.singleton(null)); String c = ToCrawl.get(0); if (Crawl.contains(c)) ToCrawl.remove(c); else { // collect links and collect data from a particular link Crawl.add(c); t = collectLinks(c); CollectData(c); ToCrawl.remove(c); Iterator<String> e3 = t.iterator(); while (e3.hasNext()) { String ee = e3.next(); if (!ToCrawl.contains(ee)) { if (!Crawl.contains(ee)) { ToCrawl.add(ee); } } } bw.write(c + "\r\n"); } } } System.out.println("To Be Crawlled"); Iterator<String> e2 = ToCrawl.iterator(); 
while (e2.hasNext()) { // write to file the movies still to be crawled. bw2.write(e2.next() + "\r\n"); } prop.setProperty("Id", Integer.toString(n)); prop.store(new FileOutputStream("config.properties"), null); br.close(); br2.close(); bw.close(); bw2.close(); }
/**
 * Creates a crawler and restores the starting movie id from {@code config.properties}.
 *
 * <p>The file is loaded into {@code prop} and the value of its {@code Id} key is
 * parsed into {@code n}.
 *
 * @throws IOException if {@code config.properties} cannot be opened or read
 * @throws NumberFormatException if the {@code Id} property is missing or is not a
 *         valid integer
 */
public Crawler() throws IOException {
    // get the starting movie id
    // Properties.load leaves the stream open, so it is closed here explicitly.
    try (FileInputStream configIn = new FileInputStream("config.properties")) {
        prop.load(configIn);
    }
    n = Integer.parseInt(prop.getProperty("Id"));
}