/**
 * Mines an agent-list page: locates the list entry that mentions the item's phone number and
 * delegates to {@link #mineAgent(String)} for the agent page that entry links to.
 *
 * @param item search hit to mine; its real URL must pass {@link #isMinableAgentList}
 * @return whatever {@code mineAgent} returns for the linked agent page, or {@code null} when the
 *         URL is not an agent list, the item has no phone, the page cannot be fetched or does not
 *         mention the phone, or the matching entry has no usable root-relative link
 */
private PhoneResult mineAgentList(SearchResultItem item) {
    if (!isMinableAgentList(item)) {
        return null;
    }

    // String.contains(null) throws, and contains("") is always true (which would attribute the
    // first listed agent to this item), so bail out early when there is no phone to look for.
    String phone = item.getPhone();
    if (TextUtils.isEmpty(phone)) {
        return null;
    }

    String pageUrl = item.getRealUrl();
    String html = getHtml(pageUrl);
    if (TextUtils.isEmpty(html) || !html.contains(phone)) {
        return null;
    }

    List<String> entries = TextUtils.getList(html, "<div id='list_", "<div class=\"clear\">");
    if (entries == null) {
        return null;
    }

    for (String entry : entries) {
        if (!entry.contains(phone)) {
            continue;
        }
        // Only the first entry containing the phone is considered; if its link is unusable
        // we give up rather than trying later entries.
        String agentHref = TextUtils.getSubString(entry, "<a href='", "'");
        if (!TextUtils.isEmpty(agentHref) && agentHref.startsWith("/")) {
            // The href is root-relative, so prefix it with the group PATTERN_AGENT_LIST
            // captures from the list URL (presumably scheme + host; confirm against the pattern).
            String domain = TextUtils.getMatchGroup(PATTERN_AGENT_LIST, pageUrl);
            if (!TextUtils.isEmpty(domain)) {
                return mineAgent(domain + agentHref);
            }
        }
        return null;
    }
    return null;
}
/**
 * Tells whether the item's real URL matches {@code PATTERN_AGENT_LIST}.
 *
 * @param item search hit whose real URL is tested
 * @return {@code true} when the URL matches the agent-list pattern
 */
private boolean isMinableAgentList(SearchResultItem item) {
    final String url = item.getRealUrl();
    return TextUtils.isMatchReg(PATTERN_AGENT_LIST, url);
}
private PhoneResult mineAgent(String url) { if (!isMinableAgent(url)) return null; PhoneResult phoneResult = new PhoneResult(); String html = getHtml(url); if (TextUtils.isEmpty(html)) return null; String floatHtml = TextUtils.getSubString(html, "<div class=\"Floating\">", "</div>"); if (!TextUtils.isEmpty(floatHtml)) { String TAG_COM = "<dd style=\"padding-top:2px\\9;*padding-top:3px;\" >"; String company = TextUtils.getSubString(floatHtml, TAG_COM, "</dd>"); phoneResult.setJigou(company); String name = TextUtils.getSubString(floatHtml, "<dd>", " "); if (!TextUtils.isEmpty(name)) phoneResult.setChenghu(TextUtils.appendJob(name, "房产经纪")); String rz = TextUtils.getSubString(html, "<div class=\"rzren\">", "</div>"); if (!TextUtils.isEmpty(rz)) { String avartar = TextUtils.getSubString(rz, "<img src=\"", "\""); phoneResult.setImage(avartar); } String address = TextUtils.getSubString(html, "<li>地<span class=\"pl24\">址</span>:", "</li>"); phoneResult.setAddress(address); List<String> hangyeList = new ArrayList<String>(); hangyeList.add("房屋中介"); phoneResult.setHangyeList(hangyeList); } // 如果没有人头像,就试着用公司logo if (TextUtils.isEmpty(phoneResult.getImage())) { String componyLogoHtml = TextUtils.getSubString(html, "<li class=\"companylogo\">", "</li>"); String componyLogo = TextUtils.getSubString(componyLogoHtml, "<img src=\"", "\""); phoneResult.setImage(componyLogo); } // 如果公司头像也没有,可能网页是另外一种格式的,如: if (TextUtils.isEmpty(phoneResult.getImage())) { String photoHtml = TextUtils.getSubString(html, "<div class=\"photo\">", "</div>"); if (!TextUtils.isEmpty(photoHtml)) { String avartar = TextUtils.getSubString(photoHtml, "<img src=\"", "\""); phoneResult.setImage(avartar); } } if (phoneResult.isFound()) return phoneResult; else return null; }
/**
 * Tells whether the item's domain matches {@code PATTERN_DOMAIN}.
 *
 * @param item search hit whose domain is tested
 * @return {@code true} when the domain matches the pattern
 */
@Override
public boolean isMinableDomain(SearchResultItem item) {
    final String domain = item.getDomain();
    return TextUtils.isMatchReg(PATTERN_DOMAIN, domain);
}
/**
 * Tells whether the item's real URL matches {@code PATTERN_FANG}.
 *
 * @param item search hit whose real URL is tested
 * @return {@code true} when the URL matches the pattern
 */
private boolean isMinableFang(SearchResultItem item) {
    final String url = item.getRealUrl();
    return TextUtils.isMatchReg(PATTERN_FANG, url);
}
private PhoneResult mineFang(SearchResultItem item) { String html = getHtml(item); if (TextUtils.isEmpty(html)) return null; String link = TextUtils.getMatchGroup(PATTERN_AGENT_LINK, html); System.out.println("link=" + link); if (!TextUtils.isEmpty(link)) { return mineAgent(link); } if (html.contains("100%个人房源")) { PhoneResult phoneResult = new PhoneResult(); String chenghu = TextUtils.getSubString(html, "<span class=\"name floatl\" id=\"Span2\">", "</span>"); if (!TextUtils.isEmpty(chenghu)) { chenghu = chenghu.trim(); chenghu = TextUtils.clearHuanhang(chenghu); if (!TextUtils.isEmpty(chenghu)) phoneResult.setChenghu(TextUtils.appendJob(chenghu, "房东")); List<String> hangyeList = new ArrayList<String>(); hangyeList.add("房屋出租或出售"); phoneResult.setHangyeList(hangyeList); } // 房产图片 String imageHtml = TextUtils.getSubString(html, "<div class=\"slider\" id=\"thumbnail\">", "</div>"); if (!TextUtils.isEmpty(imageHtml)) { String avartar = TextUtils.getSubString(imageHtml, "<img src=\"", "\""); phoneResult.setImage(avartar); } if (phoneResult.isFound()) return phoneResult; } return null; }
/**
 * Tells whether a URL points at an agent page this miner understands.
 *
 * @param url URL to test
 * @return {@code true} when the URL matches {@code PATTERN_AGENT} or
 *         {@code PATTERN_AGENT_SUBPAGE}
 */
private boolean isMinableAgent(String url) {
    if (TextUtils.isMatchReg(PATTERN_AGENT, url)) {
        return true;
    }
    return TextUtils.isMatchReg(PATTERN_AGENT_SUBPAGE, url);
}