/**
 * Runs every configured matcher against {@code element} and collects the values
 * extracted by those that match.
 *
 * <p>Each matcher group extracts a different piece of the element:
 * <ul>
 *   <li>{@code matchers} - the decoded {@code element.text()};</li>
 *   <li>{@code textMatchers} - the decoded outer HTML of the element's next sibling, if any;</li>
 *   <li>{@code subtextMatchers} - the decoded outer HTML of the element's first direct
 *       text node, if any;</li>
 *   <li>{@code htmlMatchers} - the element's inner HTML, not decoded;</li>
 *   <li>{@code ptextMatchers} - the result of {@code plainTextFormatter.getPlainText};</li>
 *   <li>{@code attrMatchers} - the value of the attribute named in the entry.</li>
 * </ul>
 * Groups are processed in the order listed, so a later group overwrites the value
 * stored by an earlier one under the same key.
 *
 * @param element the element to test
 * @return a new mutable map from matcher key to extracted value; empty when nothing matched
 */
public Map<String, String> attempt(Element element) {
    Map<String, String> attributes = new HashMap<String, String>();

    for (Entry<String, Matcher> entry : matchers.entrySet()) {
        if (entry.getValue().test(element)) {
            attributes.put(entry.getKey(), decode(element.text()));
        }
    }

    for (Entry<String, Matcher> entry : textMatchers.entrySet()) {
        if (entry.getValue().test(element)) {
            Node textNode = element.nextSibling();
            if (null != textNode) {
                attributes.put(entry.getKey(), decode(textNode.outerHtml()));
            }
        }
    }

    for (Entry<String, Matcher> entry : subtextMatchers.entrySet()) {
        if (entry.getValue().test(element)) {
            // An element without direct text children yields an empty list; calling
            // get(0) on it would throw IndexOutOfBoundsException, so check first.
            if (!element.textNodes().isEmpty()) {
                TextNode firstText = element.textNodes().get(0);
                attributes.put(entry.getKey(), decode(firstText.outerHtml()));
            }
        }
    }

    for (Entry<String, Matcher> entry : htmlMatchers.entrySet()) {
        if (entry.getValue().test(element)) {
            attributes.put(entry.getKey(), element.html());
        }
    }

    for (Entry<String, Matcher> entry : ptextMatchers.entrySet()) {
        if (entry.getValue().test(element)) {
            attributes.put(entry.getKey(), plainTextFormatter.getPlainText(element));
        }
    }

    for (Entry<String, Object[]> entry : attrMatchers.entrySet()) {
        // Each value is a pair: [0] the matcher, [1] the name of the attribute to read.
        Object[] objects = entry.getValue();
        Matcher matcher = (Matcher) objects[0];
        String attr = (String) objects[1];
        if (matcher.test(element)) {
            attributes.put(entry.getKey(), element.attr(attr));
        }
    }

    return attributes;
}
public static final List<ContentValues> synchronousFav(LoginInfo loginInfo) throws IOException { List<ContentValues> favList = new ArrayList<ContentValues>(); String urlString = "http://bbs.nju.edu.cn/" + loginInfo.getLoginCode() + "/bbsmybrd"; HttpClient client; BasicHttpParams httpParameters = new BasicHttpParams(); // Set the timeout in milliseconds until a connection is established. HttpConnectionParams.setConnectionTimeout(httpParameters, 10000); HttpConnectionParams.setSoTimeout(httpParameters, 10000); client = new DefaultHttpClient(httpParameters); client.getParams().setParameter(CoreProtocolPNames.PROTOCOL_VERSION, HttpVersion.HTTP_1_1); HttpGet uploadGet = new HttpGet(urlString); uploadGet.addHeader("Cookie", loginInfo.getLoginCookie()); HttpResponse httpResponse = client.execute(uploadGet); if (httpResponse.getStatusLine().getStatusCode() == 200) { Document doc = Jsoup.parse(EntityUtils.toString(httpResponse.getEntity())); Elements boards = doc.select("input[checked]"); if (boards.size() == 0) { return new ArrayList<ContentValues>(); } for (Element board : boards) { ContentValues values = new ContentValues(); String boardName = board.nextSibling().toString(); boardName = boardName.substring(boardName.indexOf(">") + 1); boardName = boardName.substring(0, boardName.indexOf("<")); values.put("english", boardName.substring(0, boardName.indexOf("("))); values.put( "chinese", boardName.substring(boardName.indexOf("(") + 1, boardName.length() - 1)); values.put("islocal", 0); favList.add(values); } } else { throw new IOException(); } return favList; }
public int computeScore(File file, Element link, String query, URL parentUrl) throws IOException { if (query == null) { return 0; } // substring String anchor = link.text().toLowerCase(); String[] queryTerms = query.split(" "); int K = 0; for (String q : queryTerms) { if (anchor.contains(q)) { K++; } } if (K > 0) { return (K * 50); } // substring String url = link.attr("href").toLowerCase(); K = 0; for (String q : queryTerms) { if (url.contains(q)) { K++; } } if (K > 0) { return 40; } int U = 0; int V = 0; List<String> neighborWords = new ArrayList<String>(); List<String> words = getPrevNeighbors(link.previousSibling()); if (words != null) { neighborWords.addAll(words); } words = getNextNeighbors(link.nextSibling()); if (words != null) { neighborWords.addAll(words); } for (String q : queryTerms) { if (neighborWords.contains(q)) { U++; } } BufferedReader br = new BufferedReader(new FileReader(file)); Document doc = Jsoup.parse(file, "UTF-8", parentUrl.toString()); String rawText = doc.text(); String[] raw = rawText.split(" "); List<String> rawTextList = new ArrayList<String>(); for (String s : raw) { if (!s.matches("^[a-zA-Z0-9]+$")) { s = s.replaceAll("[^\\p{Alpha}\\p{Digit}]+", ""); } rawTextList.add(s.toLowerCase()); } for (String q : queryTerms) { if (rawTextList.contains(q)) { V++; } } br.close(); int score = 4 * U + Math.abs(V - U); return score; }