/** 获取文章作者,发布时间等数据 */ public void processAuthor() throws Exception { HashMap map = new HashMap(); map.put("status", 1); List<ArticleDoc> list = articleDocDao.find(map); for (ArticleDoc bean : list) { if (!bean.getUrl().startsWith("http://www.pcpop.com/doc/")) { continue; } logger.debug("获取文章数据"); // 更新文章的作者和发布时间 try { String author = author(bean.getUrl()); String tmp1 = author.substring(author.lastIndexOf("作者") + 3, author.lastIndexOf("编辑") - 1); String tmp2 = author.substring(0, author.indexOf(":") - 2); bean.setAuthor(tmp1); bean.setPublishTime(tmp2); bean.setStatus(2); if (articleDocDao.update(bean) == 0) { logger.info("更新作者失败:" + bean.getUrl()); } else { processCount++; logger.info( "[" + bean.getId() + "]更新成功:" + bean.getUrl() + "\tprocessCount:" + processCount); } } catch (java.io.FileNotFoundException e) { bean.setStatus(10); if (articleDocDao.update(bean) == 0) { logger.error("更新作者失败:" + bean.getUrl()); } else { logger.error("[" + bean.getId() + "]更新记录状态为10[文件或地址查找找不到]:" + bean.getUrl()); } continue; } catch (org.htmlparser.util.ParserException e) { bean.setStatus(10); if (articleDocDao.update(bean) == 0) { logger.error("更新作者失败:" + bean.getUrl()); } else { logger.error("[" + bean.getId() + "]更新记录状态为10[URL解析失败]:" + bean.getUrl()); } continue; } catch (Exception e) { bean.setStatus(11); bean.setContent(e.getMessage()); if (articleDocDao.update(bean) == 0) { logger.error("更新作者和文章发布时间失败:" + bean.getUrl()); } else { logger.error("[" + bean.getId() + "]更新记录状态为11[其他异常情况]:" + bean.getUrl()); } continue; } } }
/**
 * Stores the newly published articles that are linked from a page.
 *
 * Calls {@code docByHTML} with the page content and the {@code http://www.pcpop.com/doc/}
 * prefix, then walks {@code LINKHASH} (presumably filled by {@code docByHTML} - confirm).
 * Every link whose key is not yet in {@code articleDocCache} is inserted through
 * {@code articleDocDao}; on a positive id the in-memory document gets that id and status 1,
 * is put into the cache, and {@code docCount} is incremented.
 *
 * {@code LINKHASH} is always cleared before returning, also when an exception is thrown, so
 * links left over from a failed call cannot be inserted under the web id of the next call.
 *
 * @param webid web id stored on every inserted article
 * @param content page content handed to {@code docByHTML}
 * @throws Exception if {@code docByHTML} or the insert throws
 */
public void processWithDoc(int webid, String content) throws Exception {
    try {
        docByHTML(content, "http://www.pcpop.com/doc/");
        for (Object rawKey : LINKHASH.keySet()) {
            String linkKey = (String) rawKey;
            if (articleDocCache.get(getKey(linkKey)) != null) {
                // Already known; nothing to insert.
                continue;
            }
            LinkBean link = (LinkBean) LINKHASH.get(linkKey);
            ArticleDoc doc = new ArticleDoc();
            doc.setTitle(link.getName());
            doc.setUrl(link.getLink());
            doc.setWebId(webid);
            int id = articleDocDao.insert(doc);
            if (id <= 0) {
                logger.info("失败,\t链接名称:" + link.getName() + "\n链接地址:" + link.getLink());
            } else {
                docCount++;
                doc.setId(id);
                doc.setStatus(1);
                articleDocCache.put(getKey(doc.getUrl()), doc);
                logger.info("processWithDoc,docCount:" + docCount);
            }
        }
    } finally {
        // Scratch state for one call only; must not survive a failure.
        LINKHASH.clear();
    }
}
/**
 * Preloads {@code articleDocCache} from the database.
 *
 * For every website returned by {@code websiteDao.findByFatherId(166)} the articles with that
 * {@code webId} are queried, and each article whose cache key is not present yet is put into
 * {@code articleDocCache}. Any exception is logged and not rethrown, so a failed preload does
 * not propagate to the caller.
 */
public void init() {
    try {
        List<Website> weblist = websiteDao.findByFatherId(166);
        logger.debug("初始化数据库数据");
        // One query map is reused; only the "webId" entry changes per website.
        HashMap map = new HashMap();
        for (Website bean : weblist) {
            map.put("webId", bean.getId());
            List<ArticleDoc> docList = articleDocDao.find(map);
            for (ArticleDoc doc : docList) {
                if (articleDocCache.get(getKey(doc.getUrl())) != null) {
                    continue;
                }
                logger.debug("添加对象到缓存");
                articleDocCache.put(getKey(doc.getUrl()), doc);
            }
        }
    } catch (Exception e) {
        // Report through the logger (with the cause) rather than printing to stderr.
        logger.error("初始化文章缓存失败", e);
    }
}
void contentProcess() { HashMap map = new HashMap(); map.put("status", 2); try { List<ArticleDoc> list = articleDocDao.find(map); for (ArticleDoc bean : list) { if (!bean.getUrl().startsWith("http://www.pcpop.com/doc/")) { continue; } try { // String content = content(bean.getUrl()); // if(null != content){ // bean.setContent(content); bean.setStatus(3); if (articleDocDao.update(bean) == 0) { logger.debug("更新作者失败:" + bean.getUrl()); } else { if (null != client.get(getKey(bean.getUrl()))) { client.remove(getKey(bean.getUrl())); client.put(getKey(bean.getUrl()), bean); } logger.debug("[" + bean.getId() + "]更新文章内容成功"); } // } } catch (Exception e) { bean.setStatus(11); bean.setContent(e.getMessage()); if (articleDocDao.update(bean) == 0) { logger.debug("更新文章内容失败:" + bean.getUrl()); } else { logger.debug("[" + bean.getId() + "]更新记录状态为11[其他异常情况]:" + bean.getUrl()); } continue; } } } catch (Exception e) { } }
public void processWithDoc() throws Exception { // Iterator hit = HTMLHASH.keySet().iterator(); // while(hit.hasNext()){ // String key = (String) hit.next(); // logger.debug("key:"+key); // String[] keys = key.split(":"); // String content = (String) HTMLHASH.get(key); // try{ // docByHTML(content, "http://www.pcpop.com/doc/"); // Iterator it = LINKHASH.keySet().iterator(); // ArticleDoc doc = null; // while (it.hasNext()) { // String key1 = (String) it.next(); // if(null == client.get(getKey(key1))){ // LinkBean link = (LinkBean) LINKHASH.get(key1); // doc = new ArticleDoc(); // doc.setTitle(link.getName()); // doc.setUrl(link.getLink()); // doc.setWebId(Integer.valueOf(keys[0])); // int id = articleDocDao.insert(doc); // if(!(id>0)){ // logger.debug("失败,\t链接名称:" + link.getName() + "\n链接地址:"+ // link.getLink()); // }else{ // doc.setId(id); // doc.setStatus(1); // client.add(getKey(doc.getUrl()), doc); // logger.debug("Memcached now store this object"); // } // } // } // LINKHASH.clear(); // }catch(Exception e){ // logger.debug("解析异常,跳过:"+key+"\tException:"+e.getMessage()); // continue; // } // } HashMap map = new HashMap(); map.put("status", 1); List<ArticleDoc> list = articleDocDao.find(map); for (ArticleDoc bean : list) { if (!bean.getUrl().startsWith("http://www.pcpop.com/doc/")) { continue; } logger.debug("获取文章数据"); // 更新文章的作者和发布时间 try { String author = author(bean.getUrl()); String tmp1 = author.substring(author.lastIndexOf("作者") + 3, author.lastIndexOf("编辑") - 1); String tmp2 = author.substring(0, author.indexOf(":") - 2); bean.setAuthor(tmp1); bean.setPublishTime(tmp2); bean.setStatus(2); if (articleDocDao.update(bean) == 0) { logger.debug("更新作者失败:" + bean.getUrl()); } else { logger.debug("[" + bean.getId() + "]更新成功:" + bean.getUrl()); } } catch (java.io.FileNotFoundException e) { bean.setStatus(10); if (articleDocDao.update(bean) == 0) { logger.debug("更新作者失败:" + bean.getUrl()); } else { logger.debug("[" + bean.getId() + "]更新记录状态为10[文件或地址查找找不到]:" + 
bean.getUrl()); } continue; } catch (org.htmlparser.util.ParserException e) { bean.setStatus(10); if (articleDocDao.update(bean) == 0) { logger.debug("更新作者失败:" + bean.getUrl()); } else { logger.debug("[" + bean.getId() + "]更新记录状态为10[URL解析失败]:" + bean.getUrl()); } continue; } catch (Exception e) { bean.setStatus(11); bean.setContent(e.getMessage()); if (articleDocDao.update(bean) == 0) { logger.debug("更新作者和文章发布时间失败:" + bean.getUrl()); } else { logger.debug("[" + bean.getId() + "]更新记录状态为11[其他异常情况]:" + bean.getUrl()); } continue; } } }