public static void main(String[] args) { try { int maxLevel = 2; int maxThreads = 5; if (args.length == 4) { maxThreads = Integer.parseInt(args[3]); } if (args.length >= 3) { maxLevel = Integer.parseInt(args[2]); } if (args.length >= 2) { URLQueue q = new URLQueue(); q.setFilenamePrefix(args[1]); q.push(new URL(args[0]), 0); new WSDLCrawler(q, maxLevel, maxThreads); return; } } catch (Exception e) { System.err.println("An error occured: "); e.printStackTrace(); // System.err.println(e.toString()); } System.err.println( "Usage: java WSDLCrawler <url> <filenamePrefix> [<maxLevel> [<maxThreads>]]"); System.err.println("Crawls the web for WSDL descriptions."); }
@Override public void run() { SentenceSplitor sentenceSplitor = new SentenceSplitor(); CorpusWords cache_chars_unnormal = new CorpusWords("char_unnormal"); try { PageAnalyzer pageAnalyzer = new PageAnalyzer(); while (is_run) { String target_url = urlQueue.get_one_url(); if (target_url.length() > 0) { // System.out.printf("%s // \n", target_url); pageAnalyzer.set_taget_url(target_url); // // Content Data // ArrayList<String> content_datas = pageAnalyzer.getContentDatas(); // for (int i = 0; i < content_datas.size(); i ++) { // ArrayList<String> string_sentences = // sentenceSplitor.split_sentence(content_datas.get(i)); // for (int k = 0; k < string_sentences.size(); k++) { // String string_sentence = string_sentences.get(k); // if (string_sentence.length() > 35 || // string_sentence.length() == 0) // continue; // if (!isMessyCode(cache_chars_unnormal, string_sentence)) { // db.add_web_content(string_sentence); // } // } // } // Links urlQueue.add_urls(pageAnalyzer.getLinks()); } else { Thread.sleep(1000); } } } catch (InterruptedException e) { e.printStackTrace(); } }