Beispiel #1
0
  /**
   * Runs every configured matcher group against {@code element} and collects the values of the
   * ones that match.
   *
   * <p>Each group maps an attribute key to a matcher; when the matcher accepts the element, a value
   * is stored under that key. The groups differ in where the value comes from:
   *
   * <ul>
   *   <li>{@code matchers}: the decoded {@code element.text()}
   *   <li>{@code textMatchers}: the decoded outer HTML of the element's next sibling node, if any
   *   <li>{@code subtextMatchers}: the decoded outer HTML of the element's first text node, if any
   *   <li>{@code htmlMatchers}: {@code element.html()}, stored without decoding
   *   <li>{@code ptextMatchers}: the result of {@code plainTextFormatter.getPlainText(element)}
   *   <li>{@code attrMatchers}: the value of a named attribute of the element
   * </ul>
   *
   * <p>Groups are processed in the order listed, so when the same key matches in several groups the
   * later group's value replaces the earlier one.
   *
   * @param element the element to test and extract values from
   * @return a new mutable map from attribute key to extracted value; empty when nothing matched
   */
  public Map<String, String> attempt(Element element) {
    Map<String, String> attributes = new HashMap<String, String>();
    for (Entry<String, Matcher> entry : matchers.entrySet()) {
      if (entry.getValue().test(element)) {
        attributes.put(entry.getKey(), decode(element.text()));
      }
    }

    for (Entry<String, Matcher> entry : textMatchers.entrySet()) {
      if (entry.getValue().test(element)) {
        Node textNode = element.nextSibling();
        // The element may be the last child of its parent, in which case there is no sibling.
        if (null != textNode) {
          attributes.put(entry.getKey(), decode(textNode.outerHtml()));
        }
      }
    }

    for (Entry<String, Matcher> entry : subtextMatchers.entrySet()) {
      if (entry.getValue().test(element)) {
        // An element without direct text nodes yields an empty list; calling get(0) on it
        // would throw IndexOutOfBoundsException, so such elements are skipped instead.
        if (!element.textNodes().isEmpty()) {
          TextNode textNode = element.textNodes().get(0);
          attributes.put(entry.getKey(), decode(textNode.outerHtml()));
        }
      }
    }

    for (Entry<String, Matcher> entry : htmlMatchers.entrySet()) {
      if (entry.getValue().test(element)) {
        attributes.put(entry.getKey(), element.html());
      }
    }

    for (Entry<String, Matcher> entry : ptextMatchers.entrySet()) {
      if (entry.getValue().test(element)) {
        attributes.put(entry.getKey(), plainTextFormatter.getPlainText(element));
      }
    }

    for (Entry<String, Object[]> entry : attrMatchers.entrySet()) {
      // Each value is a pair: [0] the matcher, [1] the name of the attribute to read.
      Object[] objects = entry.getValue();
      Matcher matcher = (Matcher) objects[0];
      String attr = (String) objects[1];
      if (matcher.test(element)) {
        attributes.put(entry.getKey(), element.attr(attr));
      }
    }
    return attributes;
  }
Beispiel #2
0
  /**
   * Downloads the {@code bbsmybrd} page for the given login and turns every checked board
   * checkbox on it into one {@link ContentValues} row.
   *
   * <p>Each row holds three entries: {@code "english"} (the text before the first {@code '('} of
   * the board label), {@code "chinese"} (the text after that {@code '('}, minus the label's last
   * character) and {@code "islocal"} (always {@code 0}). Checkboxes whose following node does not
   * have the expected {@code >name(title)<} shape are skipped.
   *
   * @param loginInfo supplies the login code used in the URL and the cookie sent with the request
   * @return the parsed rows; empty when the page contains no checked boards
   * @throws IOException if the request fails or the server answers with a status other than 200
   */
  public static final List<ContentValues> synchronousFav(LoginInfo loginInfo) throws IOException {
    List<ContentValues> favList = new ArrayList<ContentValues>();
    String urlString = "http://bbs.nju.edu.cn/" + loginInfo.getLoginCode() + "/bbsmybrd";
    BasicHttpParams httpParameters = new BasicHttpParams();
    // Both the connect timeout and the socket read timeout are 10 seconds (values are in ms).
    HttpConnectionParams.setConnectionTimeout(httpParameters, 10000);
    HttpConnectionParams.setSoTimeout(httpParameters, 10000);
    HttpClient client = new DefaultHttpClient(httpParameters);
    try {
      client.getParams().setParameter(CoreProtocolPNames.PROTOCOL_VERSION, HttpVersion.HTTP_1_1);
      HttpGet uploadGet = new HttpGet(urlString);
      uploadGet.addHeader("Cookie", loginInfo.getLoginCookie());
      HttpResponse httpResponse = client.execute(uploadGet);
      int statusCode = httpResponse.getStatusLine().getStatusCode();
      if (statusCode != 200) {
        // The URL is left out of the message on purpose: it embeds the login code.
        throw new IOException("Unexpected HTTP status " + statusCode + " for bbsmybrd request");
      }

      Document doc = Jsoup.parse(EntityUtils.toString(httpResponse.getEntity()));
      for (Element board : doc.select("input[checked]")) {
        // A checkbox that is the last node of its parent has no label to parse.
        if (board.nextSibling() == null) {
          continue;
        }
        String boardName = board.nextSibling().toString();
        // Keep only the text between the first '>' and the '<' that follows it.
        boardName = boardName.substring(boardName.indexOf(">") + 1);
        int tagStart = boardName.indexOf("<");
        if (tagStart < 0) {
          continue;
        }
        boardName = boardName.substring(0, tagStart);

        // The label must contain '(' with at least the trailing character after it;
        // otherwise the substring bounds below would be invalid.
        int paren = boardName.indexOf("(");
        if (paren < 0 || paren > boardName.length() - 2) {
          continue;
        }
        ContentValues values = new ContentValues();
        values.put("english", boardName.substring(0, paren));
        values.put("chinese", boardName.substring(paren + 1, boardName.length() - 1));
        values.put("islocal", 0);
        favList.add(values);
      }
    } finally {
      // Release the pooled connection on every path, including errors and early exits.
      client.getConnectionManager().shutdown();
    }

    return favList;
  }
  /**
   * Scores a link against a space-separated query.
   *
   * <p>The checks run in order and the first one that hits decides the score:
   *
   * <ol>
   *   <li>If any query term is a substring of the lower-cased anchor text, the score is 50 times
   *       the number of such terms.
   *   <li>Otherwise, if any query term is a substring of the lower-cased {@code href}, the score
   *       is 40.
   *   <li>Otherwise the score is {@code 4 * U + |V - U|}, where {@code U} is the number of query
   *       terms found among the words returned by {@code getPrevNeighbors} and
   *       {@code getNextNeighbors} for the link's siblings, and {@code V} is the number of query
   *       terms found among the words of the page text parsed from {@code file}.
   * </ol>
   *
   * <p>Query terms are used as given; only the anchor, the URL and the page words are lower-cased
   * here.
   *
   * @param file the page that contains the link; parsed as UTF-8 only when step 3 is reached
   * @param link the link element to score
   * @param query space-separated query terms, or {@code null}
   * @param parentUrl base URL handed to the parser when reading {@code file}
   * @return the score, or 0 when {@code query} is {@code null}
   * @throws IOException if {@code file} cannot be read
   */
  public int computeScore(File file, Element link, String query, URL parentUrl) throws IOException {
    if (query == null) {
      return 0;
    }
    String[] queryTerms = query.split(" ");

    // Step 1: terms appearing inside the anchor text.
    String anchor = link.text().toLowerCase();
    int anchorHits = 0;
    for (String q : queryTerms) {
      if (anchor.contains(q)) {
        anchorHits++;
      }
    }
    if (anchorHits > 0) {
      return anchorHits * 50;
    }

    // Step 2: terms appearing inside the link target; one hit is enough for the flat score.
    String url = link.attr("href").toLowerCase();
    for (String q : queryTerms) {
      if (url.contains(q)) {
        return 40;
      }
    }

    // Step 3a: whole-word hits among the words surrounding the link.
    List<String> neighborWords = new ArrayList<String>();
    List<String> words = getPrevNeighbors(link.previousSibling());
    if (words != null) {
      neighborWords.addAll(words);
    }
    words = getNextNeighbors(link.nextSibling());
    if (words != null) {
      neighborWords.addAll(words);
    }

    int neighborHits = 0;
    for (String q : queryTerms) {
      if (neighborWords.contains(q)) {
        neighborHits++;
      }
    }

    // Step 3b: whole-word hits anywhere in the page text. Jsoup reads the file itself, so no
    // separate reader is opened here.
    Document doc = Jsoup.parse(file, "UTF-8", parentUrl.toString());
    // Compiled once for the whole loop; stripping is a no-op for purely alphanumeric tokens.
    java.util.regex.Pattern nonAlphanumeric =
        java.util.regex.Pattern.compile("[^\\p{Alpha}\\p{Digit}]+");
    List<String> pageWords = new ArrayList<String>();
    for (String token : doc.text().split(" ")) {
      pageWords.add(nonAlphanumeric.matcher(token).replaceAll("").toLowerCase());
    }

    int pageHits = 0;
    for (String q : queryTerms) {
      if (pageWords.contains(q)) {
        pageHits++;
      }
    }

    return 4 * neighborHits + Math.abs(pageHits - neighborHits);
  }