/**
 * Repopulates the category tree and, when access checking is active, caches the
 * permitted category names for the currently selected group.
 *
 * <p>The tree is always cleared first. With a {@code null} category array nothing
 * else happens. With a {@code null} checker every category is passed to
 * {@code lookCategoryItem} and no cache file is written.
 *
 * @param accessChecker permission checker, or {@code null} to show all categories
 * @param categories    category names to display, may be {@code null}
 */
public void setCategories(AccessChecker accessChecker, String[] categories) {
  tree.removeAll();
  if (categories == null) {
    return;
  }

  // No checker available: show everything and skip the cache file.
  if (accessChecker == null) {
    for (int i = 0; i < categories.length; i++) {
      lookCategoryItem(categories[i]);
    }
    return;
  }

  group = creator.getSelectedGroupName();

  // Newline-separated list of the categories that passed the permission check.
  StringBuilder permitted = new StringBuilder();
  for (int i = 0; i < categories.length; i++) {
    String category = categories[i];
    if (accessChecker.isPermitAccess(group + "." + category, true)) {
      lookCategoryItem(category);
      if (permitted.length() > 0) {
        permitted.append('\n');
      }
      permitted.append(category);
    }
  }

  File cacheFile = new File(ClientConnector2.getCacheFolder("sources/type"), "group." + group);
  try {
    byte[] payload = permitted.toString().getBytes(Application.CHARSET);
    RWData.getInstance().save(cacheFile, payload);
  } catch (Exception e) {
    // Best effort: a failed write must not leave a partial cache file behind.
    cacheFile.delete();
  }
}
public static void main(String[] args) throws Exception { String address = "http://vnexpress.net/GL/Xa-hoi/2009/02/3BA0B4AB/"; webClient.setURL(address, new URL(address)); // String address = "http://vnmedia.vn/newsdetail.asp?NewsId=154558&CatId=58"; java.net.URL url = new java.net.URL(address); HTMLDocument document = HTMLParser.createDocument(loadContent(address), "utf-8"); RefsDecoder decoder = new RefsDecoder(); NodeIterator iterator = document.getRoot().iterator(); while (iterator.hasNext()) { HTMLNode node = iterator.next(); if (!node.isNode(Name.CONTENT)) continue; char[] chars = node.getValue(); chars = decoder.decode(chars); chars = CharsUtil.cutAndTrim(chars, 0, chars.length); chars = java.text.Normalizer.normalize(new String(chars), Normalizer.Form.NFC).toCharArray(); node.setValue(chars); } loadCSS(address, document); NodePath nodePath = pathParser.toPath("BODY"); HTMLNode body = extractor.lookNode(document.getRoot(), nodePath); WebPageDataSearcher dataSearcher = new WebPageDataSearcher(document); HTMLNode node = dataSearcher.search(body); File file = new File("F:\\Temp2\\web\\output\\extract.htm"); byte[] bytes = new byte[0]; if (node != null) bytes = node.getTextValue().getBytes(Application.CHARSET); RWData.getInstance().save(file, bytes); }
/**
 * Manual check that prints the on-disk byte length of a fixed file followed by
 * the character count of its content decoded as UTF-8.
 *
 * @param args unused
 * @throws Exception if the file cannot be read or decoded
 */
public static void main(String[] args) throws Exception {
  File source = new File("D:\\Temp\\print.asp.htm");
  System.out.println(source.length());

  byte[] raw = RWData.getInstance().load(source);
  String decoded = new String(raw, "utf-8");
  System.out.println(decoded.length());
}
/**
 * Manual driver for the session-login flow against Google Mail.
 *
 * <p>Steps: fetch the home page, print the cookies received, log in through
 * {@code HttpSessionUtils}, fetch the mailbox page, save its body to
 * {@code google_mail.html} and print the cookies again.
 *
 * <p>The login descriptor is the login URL and a {@code username:password} line
 * separated by a newline; the credentials here are placeholders.
 * NOTE(review): the exact descriptor format expected by
 * {@code HttpSessionUtils.login} is not visible here — confirm against that class.
 *
 * @param args unused
 * @throws Exception on any network or file failure
 */
public static void main(String[] args) throws Exception {
  WebClient webClient = new WebClient();
  String homepage = "http://mail.google.com/";
  webClient.setURL(homepage, new URL(homepage));

  HttpHost httpHost = webClient.createHttpHost(homepage);
  HttpGet httpGet = webClient.createGetMethod(homepage, "http://www.google.com");
  HttpResponse response = webClient.execute(httpHost, httpGet);
  HttpEntity entity = response.getEntity();
  System.out.println("Login form get: " + response.getStatusLine());
  if (entity != null) {
    entity.consumeContent();
  }

  System.out.println("Initial set of cookies:");
  DefaultHttpClient httpClient = (DefaultHttpClient) webClient.getHttpClient();
  printCookies(httpClient.getCookieStore().getCookies());

  HttpMethodHandler handler = new HttpMethodHandler(webClient);
  HttpSessionUtils httpSession = new HttpSessionUtils(handler, "ERROR");
  // The query string must use "&ltmpl=" and "&ltmplcache="; the previous literal
  // had them garbled into "<mpl=" / "<mplcache=", which is not a valid query.
  StringBuilder builder = new StringBuilder(
      "https://www.google.com/accounts/ServiceLogin?service=mail&passive=true&rm=false"
          + "&continue=http%3A%2F%2Fmail.google.com%2Fmail%2F%3Fui%3Dhtml%26zy%3Dl"
          + "&bsv=1k96igf4806cy&ltmpl=default&ltmplcache=2");
  builder.append('\n').append("username:password");
  httpSession.login(builder.toString(), "utf-8", new URL(homepage), homepage);

  httpGet = webClient.createGetMethod("http://mail.google.com/mail/", "http://gmail.com");
  response = webClient.execute(httpHost, httpGet);
  entity = response.getEntity();
  HttpMethodHandler httpResponseReader = new HttpMethodHandler(webClient);
  byte[] bytes = httpResponseReader.readBody(response);
  org.vietspider.common.io.RWData.getInstance().save(new File("google_mail.html"), bytes);

  System.out.println("Login form get: " + response.getStatusLine());
  if (entity != null) {
    entity.consumeContent();
  }

  System.out.println("Post logon cookies:");
  printCookies(httpClient.getCookieStore().getCookies());
}

/** Prints each cookie on its own line prefixed with "- ", or "None" when the list is empty. */
private static void printCookies(List<Cookie> cookies) {
  if (cookies.isEmpty()) {
    System.out.println("None");
    return;
  }
  for (Cookie cookie : cookies) {
    System.out.println("- " + cookie.toString());
  }
}
/**
 * Serializes {@code wordIndex2} with Java object serialization and writes the
 * result to the given file, removing any existing file first.
 *
 * <p>Failures are not propagated; they are reported through {@code LogService}.
 *
 * @param file destination of the serialized index
 */
protected void saveIndex(File file) {
  try {
    // Serialize into memory first so the target is only touched afterwards.
    ByteArrayOutputStream buffer = new ByteArrayOutputStream();
    ObjectOutputStream serializer = new ObjectOutputStream(buffer);
    serializer.writeObject(wordIndex2);
    serializer.close();

    if (file.exists()) {
      file.delete();
    }
    byte[] payload = buffer.toByteArray();
    RWData.getInstance().save(file, payload);
  } catch (Exception e) {
    LogService.getInstance().setThrowable(e);
  }
}
public static void main(String[] args) throws Exception { File file = new File("D:\\java\\headvances3\\trunk\\vietspider\\startup\\src\\test\\data\\"); System.setProperty("vietspider.data.path", file.getCanonicalPath()); TextTranslator translator = new TextTranslator(); NonDictionaryIO nonDictionaryIO = new NonDictionaryIO(); nonDictionaryIO.importFileNew("them.txt"); // NonSignDictionary.getInstance().remove("kinh mong"); // NonSignDictionary.getInstance().insert("kính mong"); String text = "oi tau ngon lam day"; // System.out.println(translator.compile(text)); file = UtilFile.getFile("system/dictionary/vn/non/", "input.txt"); byte[] bytes = RWData.getInstance().load(file); text = new String(bytes, "utf-8"); file = UtilFile.getFile("system/dictionary/vn/non/", "out.txt"); org.vietspider.common.io.RWData.getInstance() .save(file, translator.compile(text).getBytes("utf-8")); }
private void loadIgnore() { File folder = UtilFile.getFolder("system/dictionary/vn/"); File file = new File(folder, "default.ignore.key.tp"); File txtFile = new File(folder, "default.ignore.word.txt"); // System.out.println(file.lastModified() + " : "+txtFile.lastModified()); if (!file.exists() || file.length() < 1 || (txtFile.exists() && txtFile.lastModified() > file.lastModified())) { wordIndex2 = new WordIndex2(0); try { String text = new String(RWData.getInstance().load(txtFile), Application.CHARSET); String[] values = text.split(","); for (String value : values) { value = value.trim(); if (value.isEmpty()) continue; // System.out.println(" === >"+ value); wordIndex2.add(value.toLowerCase()); } // TreeSet<WordIndex2> aaa = wordIndex2.getChildren(); // Iterator<WordIndex2> iterator = aaa.iterator(); // while(iterator.hasNext()) { // System.out.println(iterator.next().getCode()); // } saveIndex(); } catch (Exception e) { LogService.getInstance().setThrowable(e); } /*"Vietnamnet","GMT","VN","TimeString", "E mail", "tp", "Việt Nam", "người việt", "Ảnh","TT","Công","Giám","HCM","Hà Nội","TNHH","SG","VNN","VNExpress", "TP HCM","Trung Quốc","Mỹ","Nhật Bản","DN","TPHCM","TP.HCM","Nga","Anh","VCK", "CLB","CĐV", "CP", "NĐ", "ND", "TTg", "Quyết định", "Nghị quyết", "Hồ Chí Minh", "HCM", "tp hồ chí minh", "sài gòn", "HN","TP Hà Nội","UBND","UBND TP", "Đoàn","Đảng", "Ban","Chiều","Học","Hôm","Ngày","and","or", "nhà nước", "việt", "chính phủ", "trung ương", "thủ tướng", "đề án", "thành phố","thủ đô", "thường trực", "phó chủ tịch", "Quốc hội" , "Công ty", "Cổ phần", "cập nhật", "Chủ tịch" , "hoa kỳ", "liên hợp quốc", "washington", "tổng thống", "Mỹ", "USD" , "lao động", "lđ", "cty", "Vietnam+"*/ // "An Giang","Bà Rịa-Vũng Tàu","Bạc Liêu","Bắc Cạn","Bắc Giang","Bắc Ninh", // "Bến Tre","Bình Dương","Bình Định","Bình Phước","Bình Thuận","Cà Mau", // "Cao Bằng","Cần Thơ","Đà Nẵng","Đắk Lắk","Đắk Nông","Điện Biên","Đồng Nai", // "Đồng Tháp","Gia Lai","Hà Giang","Hà Nam","Hà 
Nội","Hà Tĩnh","Hải Dương", // "Hải Phòng","Hậu Giang","Hòa Bình","Hồ Chí Minh","Hưng Yên","Khánh Hoà", // "Kiên Giang","Kon Tum","Lai Châu","Lạng Sơn","Lào Cai","Lâm Đồng","Long An", // "Nam Định","Nghệ An","Ninh Bình","Ninh Thuận","Phú Thọ","Phú Yên","Quảng Bình", // "Quảng Nam","Quảng Ngãi","Quảng Ninh","Quảng Trị","Sóc Trăng","Sơn La","Tây Ninh", // "Thái Bình","Thái Nguyên","Thanh Hoá","Thừa Thiên-Huế","Tiền Giang","Trà Vinh", // "Tuyên Quang","Vĩnh Long","Vĩnh Phúc","Yên Bái", // "thái lan","đài loan" return; } try { byte[] bytes = RWData.getInstance().load(file); ObjectInputStream objectInput = new ObjectInputStream(new ByteArrayInputStream(bytes)); wordIndex2 = (WordIndex2) objectInput.readObject(); objectInput.close(); return; } catch (Exception e) { file.delete(); LogService.getInstance().setThrowable(e); } }