예제 #1
0
  /**
   * Runs one crawl session, seeding from {@code RTPage} on the first run and resuming from the
   * state files otherwise.
   *
   * <p>State is kept in two text files with one entry per line: {@code Crawl.txt} (entries
   * already visited) and {@code Tocrawl.txt} (entries still queued). Both files must already
   * exist, because they are opened for reading before anything is written. An empty
   * {@code Crawl.txt} marks the first run: {@code RTPage} is recorded as visited and the links
   * collected from it become the initial queue. Otherwise both files are loaded into the
   * {@code Crawl} and {@code ToCrawl} lists.
   *
   * <p>The session then processes at most 1000 queue entries, appends every newly visited entry
   * to {@code Crawl.txt}, appends the remaining queue to {@code Tocrawl.txt}, and finally stores
   * the current value of {@code n} under the key {@code "Id"} in {@code config.properties}.
   *
   * @param RTPage seed page; only used when {@code Crawl.txt} is empty
   * @throws IOException if a state file or {@code config.properties} cannot be read or written
   */
  public void CrawlRT(String RTPage) throws IOException {
    final boolean firstRun;

    // Phase 1: load persisted state. The readers are closed before any writer is opened.
    try (BufferedReader crawledReader = new BufferedReader(new FileReader("Crawl.txt"));
        BufferedReader pendingReader = new BufferedReader(new FileReader("Tocrawl.txt"))) {
      String line = crawledReader.readLine();
      firstRun = (line == null);
      if (!firstRun) {
        // The line consumed by the emptiness probe is a real entry and must be kept too.
        do {
          Crawl.add(line);
        } while ((line = crawledReader.readLine()) != null);

        while ((line = pendingReader.readLine()) != null) {
          ToCrawl.add(line);
        }
      }
    }

    // Phase 2: crawl and persist. try-with-resources flushes and closes both writers even when
    // collectLinks/CollectData fails, so entries already visited are not lost.
    try (BufferedWriter crawledWriter = new BufferedWriter(new FileWriter("Crawl.txt", true));
        BufferedWriter pendingWriter = new BufferedWriter(new FileWriter("Tocrawl.txt", true))) {
      if (firstRun) {
        // Initial iteration: the seed is marked as visited and its links form the first queue.
        crawledWriter.write(RTPage + "\r\n");
        Crawl.add(RTPage);
        ToCrawl.addAll(collectLinks(RTPage));
      }
      System.out.println("Crawlled");

      // Upper bound on queue entries handled per session; entries that turn out to be already
      // visited count towards this bound as well.
      final int maxIterations = 1000;
      for (int i = 0; i < maxIterations; i++) {
        // Purge nulls BEFORE the emptiness check so a queue holding only nulls cannot reach
        // get(0) on an empty list.
        Crawl.removeAll(Collections.singleton(null));
        ToCrawl.removeAll(Collections.singleton(null));
        if (ToCrawl.isEmpty()) {
          break;
        }

        String current = ToCrawl.get(0);
        ToCrawl.remove(current);
        if (Crawl.contains(current)) {
          continue; // already visited, just drop it from the queue
        }

        // Collect links and collect data from this particular link.
        Crawl.add(current);
        ArrayList<String> links = collectLinks(current);
        CollectData(current);
        for (String link : links) {
          if (!ToCrawl.contains(link) && !Crawl.contains(link)) {
            ToCrawl.add(link);
          }
        }
        crawledWriter.write(current + "\r\n");
      }

      System.out.println("To Be Crawlled");
      // Write the entries still to be crawled.
      // NOTE(review): Tocrawl.txt is opened in append mode, so entries loaded from it in phase 1
      // and still queued are written again and the file only ever grows. Confirm whether the
      // file should instead be rewritten with just the remaining queue.
      for (String pending : ToCrawl) {
        pendingWriter.write(pending + "\r\n");
      }
    }

    prop.setProperty("Id", Integer.toString(n));
    // Properties.store leaves the stream open, so it has to be closed here.
    try (FileOutputStream configOut = new FileOutputStream("config.properties")) {
      prop.store(configOut, null);
    }
  }
예제 #2
0
 /**
  * Creates a crawler and restores the starting movie id from {@code config.properties}.
  *
  * <p>The file is loaded into {@code prop} and the value stored under the key {@code "Id"} is
  * parsed into {@code n}.
  *
  * @throws IOException if {@code config.properties} cannot be opened or read
  * @throws NumberFormatException if the {@code "Id"} property is missing or not an integer
  */
 public Crawler() throws IOException {
   // Properties.load leaves the stream open, so it is closed here to avoid leaking the handle.
   try (FileInputStream configIn = new FileInputStream("config.properties")) {
     prop.load(configIn);
   }
   // get the starting movie id
   n = Integer.parseInt(prop.getProperty("Id"));
 }