/**
 * Command-line entry point: seeds a {@code URLQueue} with the start URL and constructs a
 * {@code WSDLCrawler} over it (presumably the constructor starts the crawl; confirm against
 * {@code WSDLCrawler}).
 *
 * <p>Expected arguments: {@code <url> <filenamePrefix> [<maxLevel> [<maxThreads>]]}. When fewer
 * than two arguments are given, or when any exception is thrown while parsing the arguments or
 * setting up the crawler, the usage text is written to standard error.
 *
 * @param args command-line arguments as described above; {@code maxLevel} defaults to 2 and
 *     {@code maxThreads} defaults to 5 when omitted
 */
public static void main(String[] args) {
   try {
     int maxLevel = 2;
     int maxThreads = 5;
     // Use ">=" like the checks below: with "== 4" a stray extra argument made the crawler run
     // with the default thread count, silently ignoring the value the user supplied.
     if (args.length >= 4) {
       maxThreads = Integer.parseInt(args[3]);
     }
     if (args.length >= 3) {
       maxLevel = Integer.parseInt(args[2]);
     }
     if (args.length >= 2) {
       URLQueue q = new URLQueue();
       q.setFilenamePrefix(args[1]);
       // Second argument is 0 for the seed URL (presumably its crawl depth; verify in URLQueue).
       q.push(new URL(args[0]), 0);
       new WSDLCrawler(q, maxLevel, maxThreads);
       return;
     }
   } catch (Exception e) {
     // Any failure (bad number, malformed URL, crawler setup error) is reported here and then
     // falls through to the usage text below.
     System.err.println("An error occured: ");
     e.printStackTrace();
   }
   System.err.println(
       "Usage: java WSDLCrawler <url> <filenamePrefix> [<maxLevel> [<maxThreads>]]");
   System.err.println("Crawls the web for WSDL descriptions.");
 }
Example #2
0
 /**
  * Worker loop: while {@code is_run} is true, takes one URL from {@code urlQueue}, points a
  * {@code PageAnalyzer} at it and adds the links the analyzer reports back to the queue. When the
  * queue returns an empty string the loop sleeps for one second before asking again.
  *
  * <p>If the thread is interrupted while sleeping, the loop ends and the thread's interrupt
  * status is restored so that the code running this task can observe the interruption.
  */
 @Override
 public void run() {
   // These two objects are referenced only by the disabled content-extraction code below. They
   // are kept because their constructors may have side effects; TODO confirm and remove.
   SentenceSplitor sentenceSplitor = new SentenceSplitor();
   CorpusWords cache_chars_unnormal = new CorpusWords("char_unnormal");
   try {
     PageAnalyzer pageAnalyzer = new PageAnalyzer();
     while (is_run) {
       String target_url = urlQueue.get_one_url();
       if (target_url.length() > 0) {
         // System.out.printf("%s\n", target_url);
         pageAnalyzer.set_taget_url(target_url);
         // Disabled: content extraction. Splits each content block into sentences and stores
         // the ones that pass the length and messy-code filters.
         //
         // ArrayList<String> content_datas = pageAnalyzer.getContentDatas();
         // for (int i = 0; i < content_datas.size(); i++) {
         //   ArrayList<String> string_sentences =
         //       sentenceSplitor.split_sentence(content_datas.get(i));
         //   for (int k = 0; k < string_sentences.size(); k++) {
         //     String string_sentence = string_sentences.get(k);
         //     if (string_sentence.length() > 35 || string_sentence.length() == 0)
         //       continue;
         //     if (!isMessyCode(cache_chars_unnormal, string_sentence)) {
         //       db.add_web_content(string_sentence);
         //     }
         //   }
         // }

         // Links: feed the links found on this page back into the queue.
         urlQueue.add_urls(pageAnalyzer.getLinks());
       } else {
         // Nothing to crawl right now; wait before polling the queue again.
         Thread.sleep(1000);
       }
     }
   } catch (InterruptedException e) {
     // Catching InterruptedException clears the interrupt flag; set it again so the
     // interruption is not lost to callers further up the stack.
     Thread.currentThread().interrupt();
     e.printStackTrace();
   }
 }