예제 #1
0
  /**
   * Manual demo entry point: obtains the text for a URL via {@code HTML2TEXT.getText},
   * prints that text, then prints the value returned by {@code HtmlLangDetector.guess}
   * for it.
   *
   * <p>Other pages that are handy for manual runs:
   * <ul>
   *   <li>http://news.sina.com.cn</li>
   *   <li>http://alias-i.com/lingpipe/demos/tutorial/langid/read-me.html</li>
   *   <li>http://www.cas.gov.hk/eng/notice/notice_remove.html</li>
   *   <li>http://www.wsd.gov.hk/tc/job_opportunities/index_t.html</li>
   * </ul>
   *
   * @param args optional; when {@code args[0]} is present and non-empty it is used as the
   *     URL to analyse, otherwise the built-in default page is used
   */
  public static void main(final String[] args) {
    // Default sample page used when no URL is supplied on the command line.
    final String defaultUrl =
        "http://www.wsd.gov.hk/sc/customer_services_and_water_bills/application_for_water_supply/water_supply_for_new_buildings/index_t.html";

    final boolean hasUrlArg = args != null && args.length > 0 && !args[0].isEmpty();
    final String url = hasUrlArg ? args[0] : defaultUrl;

    // NOTE(review): the constructor argument looks like a path to a configuration file,
    // presumably resolved against the working directory -- confirm in HtmlLangDetector.
    final HtmlLangDetector detector = new HtmlLangDetector("conf/lang.properties");

    final String html = HTML2TEXT.getText(url);

    System.out.println(html);
    System.out.println(detector.guess(html));
  }
예제 #2
0
 /**
  * Runs {@code guess} on the text that {@code HTML2TEXT.getText} returns for the given URL.
  *
  * @param url address handed unchanged to {@code HTML2TEXT.getText}
  * @return whatever {@code guess} returns for the extracted text
  */
 public String detect(final String url) {
   return guess(HTML2TEXT.getText(url));
 }