public static void main(final String[] args) { String url, html; HtmlLangDetector detector = new HtmlLangDetector("conf/lang.properties"); // fileName = "/home/xcyao/CityU/work/welcome.html"; url = "http://news.sina.com.cn"; // url = "http://ngramj.sourceforge.net/use_ngramj.html"; // url = "http://www.let.rug.nl/~vannoord/TextCat/ShortTexts/dutch.txt"; url = "http://alias-i.com/lingpipe/demos/tutorial/langid/read-me.html"; url = "http://www.cas.gov.hk/eng/notice/notice_remove.html"; url = "http://www.wsd.gov.hk/tc/job_opportunities/index_t.html"; url = "http://www.wsd.gov.hk/sc/customer_services_and_water_bills/application_for_water_supply/water_supply_for_new_buildings/index_t.html"; // html = HtmlLangDetector.fetch(url); // html = "证券简称,今日开盘价,昨日收盘价,最近成交价,最高成交价,最低成交价,买入价"; // html = "证券简称,今日开盘价,昨日收盘价,最近成交价,最高成交价,最低成交价,买入价"; // html = "今日开盘价,的,好,人,了,是,不"; html = HTML2TEXT.getText(url); System.out.println(html); System.out.println(detector.guess(html)); }
/**
 * Runs {@code guess} on the text that {@code HTML2TEXT.getText} produces
 * for the given URL.
 *
 * @param url location handed unchanged to {@code HTML2TEXT.getText}
 * @return the value returned by {@code guess} for the extracted text
 *         (presumably a language identifier; confirm against {@code guess})
 */
public String detect(final String url) {
    return guess(HTML2TEXT.getText(url));
}